data_process.py
import pandas as pd
import json
import numpy as np
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
import gensim.downloader
def process_fake_news():
    # Load the labelled fake-news dataset, keeping only the text and label columns.
    filename = "fake-news.csv"
    df = pd.read_csv(filename)
    return df[["text", "label"]]


def process_unlabelled_data():
    # Load the unlabelled political-bias dataset; only the text column is needed.
    filename = "political-bias.csv"
    df = pd.read_csv(filename)
    return df[["text"]]
def get_glove_feature(df):
    # Convert each text into a sequence of 200-dimensional GloVe word vectors.
    features = []
    glove = gensim.downloader.load('glove-wiki-gigaword-200')
    for _, row in df.iterrows():
        text = row["text"].lower()
        words = word_tokenize(text)
        feature = []
        for word in words:
            # Skip out-of-vocabulary tokens.
            if word in glove:
                feature.append(glove[word])
        features.append(feature)
    # Texts have different lengths, so the result is a ragged (object-dtype) array.
    features = np.array(features, dtype=object)
    return features
def split(df):
    # 75/25 train/test split with a fixed seed for reproducibility.
    random_state = 42
    df_train, df_test = train_test_split(df, train_size=0.75, random_state=random_state)
    return df_train, df_test
# Input for a transformer or RNN is an array of the GloVe embeddings of each word
# in the text to be analyzed (see the padding sketch below).
# def big_news():
#     with open('FILL IN FILE NAME(S)') as fp:
#         train = [json.loads(line) for line in fp]
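
# A minimal sketch (not part of the original script) of how the ragged GloVe
# sequences could be padded into a fixed-shape batch for an RNN or transformer.
# The helper name, max_len, and the 200-d embedding size are assumptions; the
# dimension matches the glove-wiki-gigaword-200 model loaded above.
def pad_features(features, max_len=200, dim=200):
    # Truncate or zero-pad every sequence to max_len vectors of size dim.
    batch = np.zeros((len(features), max_len, dim), dtype=np.float32)
    for i, seq in enumerate(features):
        seq = seq[:max_len]
        if len(seq) > 0:
            batch[i, :len(seq)] = np.stack(seq)
    return batch
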
def process():
    # Build the feature sets: labelled data is split into train/test,
    # while the unlabelled features are returned unsplit.
    df = process_fake_news()
    # df = big_news()
    unlab = process_unlabelled_data()
    df, unlab = get_glove_feature(df), get_glove_feature(unlab)
    train_label, test = split(df)
    train_unlabel = unlab
    return train_label, train_unlabel, test
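

# Example usage (a minimal sketch, not part of the original script). It assumes
# fake-news.csv and political-bias.csv are present in the working directory and
# that the NLTK tokenizer data has been installed, e.g. nltk.download('punkt').
if __name__ == "__main__":
    train_label, train_unlabel, test = process()
    # Each element is a variable-length sequence of 200-d GloVe vectors.
    print("labelled training texts:", len(train_label))
    print("unlabelled texts:", len(train_unlabel))
    print("test texts:", len(test))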