import pandas as pd
import numpy as np
import missingno
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
#Downloading Datasets
! kaggle competitions download -c titanic
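#(The Kaggle CLI saves a zip archive; the CSVs must be extracted before reading. A minimal
# sketch, assuming the default archive name 'titanic.zip' in the Colab working directory:)
! unzip -o titanic.zip -d /content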
#Loading in data
train = pd.read_csv("/content/train.csv")
test = pd.read_csv("/content/test.csv")
sample_sub = pd.read_csv("/content/gender_submission.csv")
#Previewing the files to get an idea of their structure
train.head()
test.head()
#This is what the submission should look like
sample_sub.head()
print('train shape:', train.shape)
print('test shape:', test.shape)
print('sample submission shape:', sample_sub.shape)
#Checking non-null counts and data types of the train set
train.info()
#Checking non-null counts and data types of the test set
test.info()
#Checking missing data in the training set by column
train.isnull().sum().sort_values(ascending = False)
missingno.matrix(train)
#Checking missing data in the test set by column
test.isnull().sum().sort_values(ascending = False)
missingno.matrix(test)
#Now let's go through some categorical variables
#Survival chance by 'Sex'
train[['Sex', 'Survived']].groupby('Sex', as_index = False).mean().sort_values(by = 'Survived', ascending = False)
#Female passengers were much more likely to survive
#Survival chance by 'Pclass' (Passenger Class)
train[['Pclass', 'Survived']].groupby(['Pclass'], as_index = False).mean().sort_values(by = 'Survived', ascending = False)
#1st-class passengers were more likely to survive, perhaps because they were prioritized during evacuation
#Survival chance by 'Pclass' (Passenger Class) and 'Sex'
graph = sns.catplot(x = 'Pclass', y = 'Survived', hue = 'Sex', data = train, kind = 'bar')  #factorplot was renamed catplot in seaborn 0.9
graph.despine(left = True)
plt.ylabel('Survival chance')
plt.title('Survival chance by Sex and Passenger Class')
#Survival chance by 'Embarked'
train[['Embarked', 'Survived']].groupby(['Embarked'], as_index = False).mean().sort_values(by = 'Survived', ascending = False)
#Passengers who embarked at 'C' were more likely to survive
#Chart of counts by passenger class and embarkation point
sns.catplot(x = 'Pclass', col = 'Embarked', data = train, kind = 'count')
#Most passengers embarked at 'S' and traveled 3rd class
#Now will go through some numeric variables
#Detect and remove outliers in numerical variables
def detect_outliers(df, n, features):
    """
    Loop through a list of features and detect the outliers in each one. A data point is
    deemed an outlier if it falls below the first quartile minus the outlier step or above
    the third quartile plus the outlier step, where the outlier step is 1.5 times the
    interquartile range (the Tukey method). Each feature's outlier indices are collected,
    and any index flagged as an outlier in more than n features is returned.
    """
    outlier_indices = []
    for col in features:
        #nanpercentile so columns with missing values (e.g. 'Age') still yield quartiles
        Q1 = np.nanpercentile(df[col], 25)
        Q3 = np.nanpercentile(df[col], 75)
        IQR = Q3 - Q1
        outlier_step = 1.5 * IQR
        outlier_list_col = df[(df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step)].index
        outlier_indices.extend(outlier_list_col)
    outlier_indices = Counter(outlier_indices)
    multiple_outliers = [key for key, value in outlier_indices.items() if value > n]
    return multiple_outliers
outliers_to_drop = detect_outliers(train, 2, ['Age', 'SibSp', 'Parch', 'Fare'])
print("drop these {} indices: ".format(len(outliers_to_drop)), outliers_to_drop)
#Inspect the rows at those indices
train.loc[outliers_to_drop, :]
#Drop those indices from the train set
train = train.drop(outliers_to_drop, axis = 0).reset_index(drop = True)
#Now will get a heat map with numeric variables
sns.heatmap(train[['Survived', 'SibSp', 'Parch', 'Age', 'Fare']].corr(), annot = True, fmt = '.2f', cmap = 'coolwarm')
#'Fare' shows the strongest correlation with 'Survived' among these numeric variables
#Survival chance by 'SibSp' (Sibling and Spouse)
train[['SibSp', 'Survived']].groupby('SibSp', as_index = False).mean().sort_values(by = 'Survived', ascending = False)
#Passengers with 1 sibling/spouse aboard were more likely to survive
#Survival chance by 'Parch' (Parent and Child)
train[['Parch', 'Survived']].groupby('Parch', as_index = False).mean().sort_values(by = 'Survived', ascending = False)
#Passengers with 3 parents/children aboard had the highest survival rate
#Null values in age
train['Age'].isnull().sum()
#Survival chance by age
sns.kdeplot(train['Age'][train['Survived'] == 0], label = 'Did not survive')
sns.kdeplot(train['Age'][train['Survived'] == 1], label = 'Survived')
plt.xlabel('Age')
plt.legend()  #needed for the labels to show
plt.title('Passenger Age Distribution by Survival')
#Null values in 'Fare'
train['Fare'].isnull().sum()
###################### Data Preprocessing ##############################
#Drop 'Ticket' and 'Cabin' from both datasets:
#dropping the 'Ticket' column since we already have 'Fare'
#dropping the 'Cabin' column since it contains a lot of null values
train = train.drop(['Ticket', 'Cabin'], axis = 1)
test = test.drop(['Ticket', 'Cabin'], axis = 1)
#Now will see the missing(null) values in the training set
train.isnull().sum().sort_values(ascending = False)
#Fill 'Embarked' with the most frequent value in the set
freq_e = train['Embarked'].dropna().mode()[0]
train['Embarked'] = train['Embarked'].fillna(freq_e)  #reassign rather than fillna(inplace = True)
#Now will see the missing(null) values in the test set
test.isnull().sum().sort_values(ascending = False)
#Fill 'Fare' with the median 'Fare' value of the test set
median = test['Fare'].dropna().median()
test['Fare'] = test['Fare'].fillna(median)
#Let's combine both train and test sets
combination = pd.concat([train, test], axis = 0).reset_index(drop = True)
combination.head()
#Missing values in combined dataset
combination.isnull().sum().sort_values(ascending = False)
#We can ignore the missing 'Survived' values since they belong to the test rows
#Converting the 'Sex' column into numerical values
combination['Sex'] = combination['Sex'].map({'male': 0, 'female': 1})
#Time to fill in the 'Age' column
age_null_index = list(combination[combination['Age'].isnull()].index)  #indices of rows with missing 'Age'
len(age_null_index)
'''
This loop goes through each index in age_null_index and locates the rows that share the
same 'SibSp', 'Parch' and 'Pclass' values; it fills the missing age with the median age
of those rows, and if no matching rows are found it falls back to the median of the
'Age' column.
'''
for i in age_null_index:
    median_age = combination['Age'].median()
    predicted_age = combination['Age'][(combination['SibSp'] == combination.iloc[i]['SibSp']) &
                                       (combination['Parch'] == combination.iloc[i]['Parch']) &
                                       (combination['Pclass'] == combination.iloc[i]['Pclass'])].median()
    #use .loc rather than chained ['Age'].iloc[i] assignment, which may write to a copy
    if np.isnan(predicted_age):
        combination.loc[i, 'Age'] = median_age
    else:
        combination.loc[i, 'Age'] = predicted_age
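#(For reference, a vectorized sketch of the same imputation; left commented out because
# the loop above has already filled 'Age'. 'group_median' is an illustrative name, not
# from the original script.)
#group_median = combination.groupby(['SibSp', 'Parch', 'Pclass'])['Age'].transform('median')
#combination['Age'] = combination['Age'].fillna(group_median).fillna(combination['Age'].median())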
#Check if it worked
combination['Age'].isnull().sum()
combination.head()
#Time to transform these variables into numeric values
#Plotting the 'Fare' distribution
sns.histplot(combination['Fare'], kde = True, stat = 'density', label = 'Skewness: %.2f'%(combination['Fare'].skew()))  #distplot is deprecated
plt.legend(loc = 'best')
plt.title('Fare Distribution')
#Apply a log transform to reduce skewness
combination['Fare'] = combination['Fare'].map(lambda x : np.log(x) if x > 0 else 0)
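#(Side note: np.log1p is a common alternative that handles zero fares without the
# conditional; it computes log(1 + x), so its values differ slightly from log(x).)
#combination['Fare'] = np.log1p(combination['Fare'])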
#Plotting the 'Fare' distribution after the log transform
sns.histplot(combination['Fare'], kde = True, stat = 'density', label = 'Skewness: %.2f'%(combination['Fare'].skew()))
plt.legend(loc = 'best')
plt.title('Fare Distribution After Log')
combination.head()
#Will extract 'Title' from 'Name'
combination['Title'] = [name.split(',')[1].split('.')[0].strip() for name in combination['Name']]
combination[['Name', 'Title']].head()
combination['Title'].value_counts()
#Consolidating the rare titles into a single 'Rare' category
combination['Title'] = combination['Title'].replace(['Dr', 'Rev', 'Col', 'Major', 'Lady', 'Jonkheer', 'Don', 'Capt',
                                                     'the Countess', 'Sir', 'Dona'], 'Rare')
combination['Title'] = combination['Title'].replace(['Mlle', 'Ms'], 'Miss')
combination['Title'] = combination['Title'].replace('Mme', 'Mrs')
sns.countplot(x = 'Title', data = combination)
#Survival chance by 'Title'
combination[['Title', 'Survived']].groupby(['Title'], as_index = False).mean().sort_values(by = 'Survived', ascending = False)
#We don't need the 'Name' column anymore, so drop it
combination = combination.drop('Name', axis=1)
combination.head()
#Will calculate each passenger's family size from 'SibSp' and 'Parch'
combination['FamilySize'] = combination['SibSp'] + combination['Parch'] + 1
combination[['SibSp', 'Parch', 'FamilySize']].head()
#Chances of survival by 'FamilySize'
combination[['FamilySize', 'Survived']].groupby('FamilySize', as_index = False).mean().sort_values(by = 'Survived', ascending = False)
#Will create a new column to define if the passenger is alone or not
combination['IsAlone'] = 0
combination.loc[combination['FamilySize'] == 1, 'IsAlone'] = 1
combination.head()
#Chances of survival by 'IsAlone'
combination[['IsAlone', 'Survived']].groupby('IsAlone', as_index = False).mean().sort_values(by = 'Survived', ascending = False)
#Time to drop 'FamilySize', 'SibSp' and 'Parch' from the dataframe
combination = combination.drop(['SibSp', 'Parch', 'FamilySize'], axis = 1)
combination.head()
#Will create a new column 'AgeBand' by cutting 'Age' into 5 equal-width groups
combination['AgeBand'] = pd.cut(combination['Age'], 5)
combination.head()
#Chances of survival by 'AgeBand'
combination[['AgeBand', 'Survived']].groupby('AgeBand', as_index=False).mean().sort_values(by = 'AgeBand')
#Will convert the 'Age' column to ordinal groups 0-4 using the band edges from 'AgeBand'
combination.loc[combination['Age'] <= 16.136, 'Age'] = 0
combination.loc[(combination['Age'] > 16.136) & (combination['Age'] <= 32.102), 'Age'] = 1
combination.loc[(combination['Age'] > 32.102) & (combination['Age'] <= 48.068), 'Age'] = 2
combination.loc[(combination['Age'] > 48.068) & (combination['Age'] <= 64.034), 'Age'] = 3
combination.loc[(combination['Age'] > 64.034), 'Age'] = 4
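#(Equivalent shortcut sketch: pd.cut with labels = False returns the bin codes directly
# and avoids hand-copying the band edges above, e.g.
# combination['Age'] = pd.cut(combination['Age'], 5, labels = False))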
combination.head()
#Will drop 'AgeBand'
combination = combination.drop('AgeBand', axis = 1)
#Will convert data type of 'Age' into an integer
combination['Age'] = combination['Age'].astype('int32')
combination['Age'].dtype
#Will create a new column by multiplying 'Age' and 'Pclass'
combination['Age*Class'] = combination['Age'] * combination['Pclass']
combination.head()
#Encoding 'Title' and 'Embarked' as one-hot dummy variables, then binning 'Fare' into ordinal groups
#First will encode 'Title' and 'Embarked'
combination = pd.get_dummies(combination, columns = ['Title'])
combination = pd.get_dummies(combination, columns = ['Embarked'], prefix = 'Em')
combination.head()
#Now will divide 'Fare' into 4 groups, as we did with 'Age'
combination['FareBand'] = pd.cut(combination['Fare'], 4)
#Survival chances by 'FareBand'
combination[['FareBand', 'Survived']].groupby(['FareBand'], as_index=False).mean().sort_values(by = 'FareBand')
#Will convert the 'Fare' column to ordinal groups 0-3 using the band edges from 'FareBand'
combination.loc[combination['Fare'] <= 1.56, 'Fare'] = 0
combination.loc[(combination['Fare'] > 1.56) & (combination['Fare'] <= 3.119), 'Fare'] = 1
combination.loc[(combination['Fare'] > 3.119) & (combination['Fare'] <= 4.679), 'Fare'] = 2
combination.loc[combination['Fare'] > 4.679, 'Fare'] = 3
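#(The same labels = False shortcut noted for 'Age' applies here too; pd.qcut is another
# common option, giving equal-frequency rather than equal-width bins.)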
combination.head()
#Will drop 'FareBand' column
combination = combination.drop('FareBand', axis = 1)
#Converting 'Fare' into integer
combination['Fare'] = combination['Fare'].astype('int32')
combination.head()
#Time to separate train and test from combination
train = combination[:len(train)].copy()  #.copy() so later column assignments do not warn about writing to a slice
test = combination[len(train):].copy()
train.head()
#Will drop 'PassengerId' from the training set
train = train.drop('PassengerId', axis = 1)
train.head()
#Converting 'Survived' to integer in the train set
train['Survived'] = train['Survived'].astype('int32')
train.head()
#Time for test set
test.head()
#Drop 'Survived' from the test set
test = test.drop('Survived', axis = 1)
test.head()
#Finally, time to model
#Will split the train data into features and target
X_train = train.drop('Survived', axis = 1)
Y_train = train['Survived']
X_test = test.drop('PassengerId', axis = 1).copy()
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
#Will go with a Random Forest
rf = RandomForestClassifier(n_estimators = 100)
rf.fit(X_train, Y_train)
Y_pred = rf.predict(X_test)
acc_rf = round(rf.score(X_train, Y_train) * 100, 2)  #accuracy on the training data, an optimistic estimate
print(acc_rf)
#Cross-validation for a less optimistic accuracy estimate
cv_results = cross_val_score(RandomForestClassifier(), X_train, Y_train, scoring = 'accuracy', cv = 10)
print(cv_results.mean())
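#GridSearchCV is imported above but never used; here is a minimal tuning sketch. The grid
#values below are illustrative assumptions, not choices from the original script.
param_grid = {'n_estimators': [100, 300], 'max_depth': [None, 5, 10]}
grid = GridSearchCV(RandomForestClassifier(), param_grid, scoring = 'accuracy', cv = 5)
grid.fit(X_train, Y_train)
print(grid.best_params_, grid.best_score_)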
#Preparing for Kaggle submission
print(len(Y_pred))
sample_sub.head()
submission = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': Y_pred})
submission.head()
submission.shape
#Save the CSV file
submission.to_csv('/content/submission.csv', index = False)