-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcreate_corpus.py
133 lines (97 loc) · 4.26 KB
/
create_corpus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import csv
import numpy as np
import pandas as pd
from random import sample
# hyper parameters
max_char_n = 20  # length, in characters, of each generated example
n_examples = 100000  # number of examples to generate

# output locations (output_dir keeps its trailing slash so plain
# concatenation yields the full path)
output_dir = 'data/'
charset_file = output_dir + 'charset.csv'
dataset_file = output_dir + 'dataset.csv'
# Functions
# Transforms a csv into an array of song lyrics (None, 1)
def csvToSongLyricsArray(csv):
    """Extract lowercased 1990s pop lyrics from a songs DataFrame.

    Args:
        csv: DataFrame with ``year``, ``genre`` and ``lyrics`` columns.
            (The name shadows the ``csv`` module inside this function only.)

    Returns:
        A ``(lyrics, n_songs)`` tuple: a Series of lowercased lyrics indexed
        from 0, and the number of entries in that Series.
    """
    # Selection criteria: released 1990-1999, pop genre, not an instrumental.
    in_nineties = (csv['year'] > 1989) & (csv['year'] < 2000)
    is_pop = csv['genre'] == 'Pop'
    has_words = csv['lyrics'] != '[Instrumental]'

    # dropna() discards any selected row with a null in *any* column.
    selected = csv[in_nineties & is_pop & has_words].dropna()

    # Keep only the lyrics, renumber from 0, and normalise the case.
    songs = selected['lyrics'].reset_index(drop=True).str.lower()
    return songs, songs.shape[0]
# filter out any song where lyrics contain a character outside the chars set
def filterLyrics(charset, lyrics):
    """Keep only the songs whose every character belongs to ``charset``.

    Args:
        charset: Iterable of allowed single-character strings.
        lyrics: Iterable of song lyrics, each one a string.

    Returns:
        A ``(filtered_lyrics, n_filtered_lyrics)`` tuple. ``filtered_lyrics``
        holds one entry per accepted song, each entry being that song split
        into a list of characters; ``n_filtered_lyrics`` is its length.
        A song with no characters at all is accepted (as an empty list).
    """
    # Build the lookup once: set membership is O(1), whereas testing against
    # the list form rescans the whole charset for every character.
    allowed = frozenset(charset)

    filtered_lyrics = []
    for lyric in lyrics:
        lyric_chars = list(lyric)
        # all() stops at the first disallowed character instead of scanning
        # the rest of the song.
        if all(char in allowed for char in lyric_chars):
            filtered_lyrics.append(lyric_chars)

    return filtered_lyrics, len(filtered_lyrics)
# flatten the per-song character lists into one flat list of characters
def flatten_lyrics(lyrics):
    """Concatenate the elements of every song into one flat list.

    Args:
        lyrics: Iterable of songs, each itself an iterable of elements.

    Returns:
        A ``(flattened, count)`` tuple: the flat list, in song order, and
        the number of elements it contains.
    """
    merged = []
    for song in lyrics:
        merged.extend(song)
    return merged, len(merged)
def generateDataset(lyrics, n_chars, n_examples, max_char_n):
    """Sample random fixed-length windows from a flat sequence.

    Args:
        lyrics: Sliceable sequence to sample from.
        n_chars: Number of elements of ``lyrics`` to sample within.
        n_examples: Number of windows to draw.
        max_char_n: Length of each window.

    Returns:
        A list of ``n_examples`` slices of ``lyrics``, each ``max_char_n``
        long, starting at independently drawn random positions.

    Raises:
        ValueError: If ``n_chars`` is smaller than ``max_char_n``, so that
            not even one full window fits.
    """
    max_index = n_chars - max_char_n
    if max_index < 0:
        raise ValueError(
            "n_chars ({}) must be at least max_char_n ({})".format(
                n_chars, max_char_n))

    # randint's upper bound is exclusive, so add 1: otherwise the final valid
    # window (starting at max_index) could never be drawn, and an input of
    # exactly max_char_n elements would be rejected.
    start_indices = np.random.randint(0, max_index + 1, size=n_examples)

    return [lyrics[start:start + max_char_n] for start in start_indices]
# Extract and Transform Raw Dataset
# load raw data file as a dataframe
raw_data = pd.read_csv('data/raw.csv')

# get formatted lyrics and number of songs
lyrics, n_lyrics = csvToSongLyricsArray(raw_data)

# examine the number of song lyrics we have
print("Number of lyrics: {}".format(n_lyrics))
print("Lyric Example: {}".format(lyrics[0]))

# Filter out non-english lyrics
# Allowed characters: apostrophe, a-z, newline, punctuation and space.
# Each character appears exactly once so that len(charset) and the rows of
# charset.csv reflect the number of distinct characters.
charset = [
    "'",
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    '\n',
    '!', '"', '$', '%', '&', '(', ')', '*', '+', ',', '-', '.', '/',
    ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`',
    '{', '|', '}', '~',
    ' ',
]

# determine number of characters in our set
n_charset = len(charset)
print("Number of characters in chars: {}".format(n_charset))

# filter out any song where lyrics contain a character outside the english set
filtered_lyrics, n_filtered_lyrics = filterLyrics(charset, lyrics)
print("Number of english songs: {}".format(n_filtered_lyrics))
print("A english song lyric: {}".format(filtered_lyrics[0]))

# flatten english song lyrics into one list of characters
flattened_lyrics, n_chars = flatten_lyrics(filtered_lyrics)
print("Number of song lyrics characters: {}".format(n_chars))
print("Section of song lyrics: {}".format(flattened_lyrics[0:100]))

# Extract the subset we are interested in
# generate n_examples examples of max_char_n length
dataset = generateDataset(flattened_lyrics, n_chars, n_examples, max_char_n)
print("Number of examples in dataset: {}".format(len(dataset)))
print("Example: {}".format(dataset[0]))

# Export datasets
# save charset: writerows treats each one-character string as a one-field row,
# so the file holds one character per row
with open(charset_file, 'w', newline='') as csvFile:
    writer = csv.writer(csvFile, delimiter=',')
    writer.writerows(charset)
# save dataset
with open(dataset_file, 'w', newline='') as csvFile:
file = csv.writer(csvFile, delimiter=',')