-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_generator.py
151 lines (128 loc) · 6.28 KB
/
data_generator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import math
import random
import numpy as np
from classification_helper import load_csv, save_csv
class DataGen:
    """Collection of helpers that synthesise extra rows for a name dataset.

    The helpers operate on 2-D NumPy arrays whose first column holds a name
    and whose second column holds a label. Generated rows use the label
    ``1`` / ``'1'`` or copy the label of the row they were derived from.

    All helpers are static: they can be called on the class
    (``DataGen.gen_char(...)``) or on an instance.
    """

    def __init__(self) -> None:
        pass

    @staticmethod
    def _rows_to_array(rows):
        """Return *rows* as an array that is 2-D even when *rows* is empty."""
        if not rows:
            # np.array([]) is 1-D, which makes np.concatenate(..., axis=0)
            # against a 2-D dataset fail; keep the two-column shape instead.
            return np.empty((0, 2), dtype=str)
        return np.array(rows)

    @staticmethod
    def _random_digits(min_count, max_count):
        """Return a string of random digits, between min_count and max_count long."""
        count = random.randint(min_count, max_count)
        return "".join(str(random.randint(0, 9)) for _ in range(count))

    @staticmethod
    def _gen_gfx(data, prefix="", suffix=""):
        """Wrap every name that lacks "gfx" in *prefix*/*suffix*, labelled '1'."""
        names = data[:, 0].astype(str)
        seen = set()
        rows = []
        for name in names:
            if "gfx" in name:
                continue
            tagged = f"{prefix}{name}{suffix}"
            if tagged not in seen:
                seen.add(tagged)
                rows.append([tagged, '1'])  # label must be a string
        return DataGen._rows_to_array(rows)

    @staticmethod
    def gen_gfx_suffix(data):
        """Return ``[name + "_gfx", '1']`` for each distinct name without "gfx".

        Args:
            data: 2-D array; column 0 holds the names.

        Returns:
            2-D array of new rows; shape ``(0, 2)`` when nothing is generated.
        """
        return DataGen._gen_gfx(data, suffix="_gfx")

    @staticmethod
    def randomize_numbers(data):
        """Return variants of each name with one digit value re-rolled.

        For every row and every digit 0-9, two variants are built in which
        all occurrences of that digit are replaced by one random digit.
        Variants are de-duplicated per row and keep the row's label.

        NOTE(review): when a digit does not occur in the name (or the random
        digit equals the original) the variant is the unchanged name, so each
        row also contributes a plain copy of itself. Kept as-is because the
        size of the generated dataset depends on it.

        Args:
            data: 2-D string array of ``[name, label]`` rows.

        Returns:
            2-D array of new rows; shape ``(0, 2)`` when *data* has no rows.
        """
        rows = []
        for name, label in zip(data[:, 0], data[:, 1]):
            seen = set()
            for digit in range(10):
                # Two independent draws per digit.
                for _ in range(2):
                    variant = name.replace(str(digit), str(random.randint(0, 9)))
                    if variant not in seen:
                        seen.add(variant)
                        rows.append([variant, label])
        return DataGen._rows_to_array(rows)

    @staticmethod
    def gen_new_dim_inverse(data):
        """Append a third column holding ``1 - label`` to a two-column array.

        Args:
            data: 2-D array with exactly two columns; column 1 must be
                convertible to ``int``.

        Returns:
            Array with the complementary column stacked on the right.

        Raises:
            ValueError: If *data* is not 2-D with exactly two columns.
        """
        if data.ndim != 2 or data.shape[1] != 2:
            raise ValueError("Input matrix must have 2 columns")
        complementary_column = 1 - data[:, 1].astype(int)
        return np.column_stack((data, complementary_column))

    @staticmethod
    def convert_column_to_int(data, column_index):
        """Parse one column as ``int`` and write it back into *data* in place.

        NOTE(review): the values are stored back into *data*, which keeps its
        own dtype. For a string array the column therefore ends up holding
        strings again, so this mainly validates that the column is integral.
        Confirm whether callers expect a real integer column.

        Args:
            data: 2-D array, modified in place.
            column_index: Index of the column to convert.

        Returns:
            The same *data* array.
        """
        data[:, column_index] = data[:, column_index].astype(int)
        return data

    @staticmethod
    def gen_gfx_prefix(data):
        """Return ``["gfx_" + name, '1']`` for each distinct name without "gfx".

        Args:
            data: 2-D array; column 0 holds the names.

        Returns:
            2-D array of new rows; shape ``(0, 2)`` when nothing is generated.
        """
        return DataGen._gen_gfx(data, prefix="gfx_")

    @staticmethod
    def full_name_gen(file_path_names, file_path_surnames, amount, gen_trailing_numbers = False, insert = ''):
        """Generate random ``name + insert + surname`` rows labelled ``1``.

        With *gen_trailing_numbers* each pick yields two rows: the plain full
        name and the same name followed by 0-4 random digits (so the second
        row repeats the first when zero digits are drawn).

        Args:
            file_path_names: CSV passed to ``load_csv``; the first field of
                each loaded row is used as a first name.
            file_path_surnames: CSV passed to ``load_csv``; the first field
                of each loaded row is used as a surname.
            amount: Number of name/surname picks.
            gen_trailing_numbers: Also emit a digit-suffixed variant.
            insert: Text placed between first name and surname.

        Returns:
            2-D array of new rows; shape ``(0, 2)`` when *amount* is 0.
        """
        surnames = load_csv(file_path_surnames)
        names = load_csv(file_path_names)
        picked_names = []
        for _ in range(amount):
            name = f"{random.choice(names)[0]}{insert}{random.choice(surnames)[0]}"
            picked_names.append([name, 1])
            if gen_trailing_numbers:
                picked_names.append([name + DataGen._random_digits(0, 4), 1])
        return DataGen._rows_to_array(picked_names)

    @staticmethod
    def name_gen(file_path_names, amount, gen_trailing_numbers = True):
        """Generate random first-name rows labelled ``1``.

        Args:
            file_path_names: CSV passed to ``load_csv``; the first field of
                each loaded row is used as a name.
            amount: Number of rows to generate.
            gen_trailing_numbers: Append 2-5 random digits to each name.

        Returns:
            2-D array of new rows; shape ``(0, 2)`` when *amount* is 0.
        """
        names = load_csv(file_path_names)
        picked_names = []
        for _ in range(amount):
            name = f"{random.choice(names)[0]}"
            if gen_trailing_numbers:
                name += DataGen._random_digits(2, 5)
            picked_names.append([name, 1])
        return DataGen._rows_to_array(picked_names)

    @staticmethod
    def gen_char(data: np.ndarray, char = ''):
        """Insert *char* at a random position of every name.

        The position is drawn from ``0 .. len(name) - 1``, so *char* is never
        appended after the last character. An empty name yields *char* alone
        instead of raising. Results are de-duplicated.

        Args:
            data: 2-D array of ``[name, label]`` rows.
            char: Text to insert.

        Returns:
            2-D array of new rows carrying the label of the first row in
            *data* with the same name; shape ``(0, 2)`` when *data* is empty.
        """
        first_column: np.ndarray = data[:, 0].astype(str)
        first_label = {}  # name -> label of its first occurrence
        seen = set()
        rows = []
        for name, label in zip(first_column, data[:, 1]):
            # Duplicate names reuse the label of their first occurrence.
            is_scammer = first_label.setdefault(name, label)
            # randint(0, -1) would raise for an empty name.
            index = random.randint(0, len(name) - 1) if name else 0
            new_value = name[:index] + char + name[index:]
            if new_value not in seen:
                seen.add(new_value)
                rows.append([new_value, is_scammer])
        return DataGen._rows_to_array(rows)

    @staticmethod
    def longest_string_length(arr):
        """Return the length of the longest entry in column 0 (0 if empty)."""
        if arr.size == 0:
            return 0
        return max(len(entry) for entry in arr[:, 0])
def main() -> None:
    """Build the augmented dataset from ``data_for_gen.csv`` and save it.

    Loads the raw ``[name, label]`` rows, appends several rounds of generated
    rows produced by ``DataGen``, prints a short summary and writes the
    result to ``generated_data.csv``.
    """
    arr = np.array(load_csv("data_for_gen.csv"))
    raw_data_len = len(arr)
    # Count labels only (column 1); comparing the whole array would also
    # count any name that happens to be the string "1".
    raw_data_scam = np.count_nonzero(arr[:, 1] == "1")

    # Rows labelled '0', plus a copy of each with '_' inserted.
    users = arr[arr[:, 1] == '0']
    users = np.concatenate((users, DataGen.gen_char(users, char='_')))
    # Disabled: arr = np.concatenate((arr, DataGen.gen_char(users, char='_')))

    arr = np.concatenate((arr, DataGen.randomize_numbers(users)), axis=0)
    arr = np.concatenate((arr, DataGen.randomize_numbers(arr)), axis=0)

    arr = np.concatenate((arr, DataGen.name_gen("names.csv", 200, gen_trailing_numbers=True)))
    arr = np.concatenate((arr, DataGen.full_name_gen("names.csv", "surnames.csv", 200)))
    arr = np.concatenate((arr, DataGen.full_name_gen("names.csv", "surnames.csv", 200, gen_trailing_numbers=True)))
    # Disabled: arr = np.concatenate((arr, DataGen.full_name_gen("names.csv", "surnames.csv", 1000, gen_trailing_numbers=True, insert = '_')))

    # Three further digit-randomisation passes over the growing dataset.
    for _ in range(3):
        arr = np.concatenate((arr, DataGen.randomize_numbers(arr)), axis=0)

    arr = np.concatenate((arr, DataGen.gen_gfx_prefix(arr)), axis=0)
    arr = np.concatenate((arr, DataGen.gen_gfx_suffix(arr)), axis=0)
    arr = np.concatenate((arr, [["abcdefghijklm", "0"],["nopqrtsuvwxyz", "0"], ["0987654321", "0"]]), axis=0)

    scam_percent = math.floor((np.count_nonzero(arr[:, 1] == "1") / len(arr)) * 100)
    # Assemble the report line by line so no whitespace clean-up (and no
    # accidental removal of the spaces between words) is needed.
    print("\n".join((
        f"{scam_percent}% of the Dataset are scammers",
        f"Of that {raw_data_scam} were not generated",
        f"The item generation equates to {len(arr)-raw_data_len} items",
        f"Which brings the total data set to a lenght of {len(arr)}",
    )))
    print(f"Please use {DataGen.longest_string_length(arr)} as the Squence length")
    save_csv("generated_data.csv", arr)


if __name__ == "__main__":
    main()