-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTitanic_random_forest_classification.py
90 lines (62 loc) · 2.57 KB
/
Titanic_random_forest_classification.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#!/usr/bin/env python
# coding: utf-8
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# # Data preprocessing
# Read the training set from the working directory and immediately discard the
# Cabin, Name and Ticket columns, which play no part in the rest of the script.
train = pd.read_csv('train.csv').drop(columns=['Cabin', 'Name', 'Ticket'])
def impute_age(cols):
    """Return the passenger's age, filling a missing value with a class mean.

    Parameters
    ----------
    cols : sequence of two items, ``(Age, Pclass)`` in that order
        Typically one row of ``train[['Age', 'Pclass']]`` as handed over by
        ``DataFrame.apply(..., axis=1)``; a plain list or tuple works too.

    Returns
    -------
    The original ``Age`` when it is present. Otherwise the mean ``Age`` of all
    rows of the module-level ``train`` frame with the same ``Pclass``. Any
    ``Pclass`` other than 1 or 2 is treated as class 3.

    Notes
    -----
    Reads the global ``train`` DataFrame at call time, so it must be defined
    (with ``Age`` and ``Pclass`` columns) before this function is called.
    """
    # Take the two items by position via iteration instead of ``cols[0]``:
    # integer keys on a label-indexed Series are deprecated in pandas 2.x and
    # rejected in pandas 3.0, while iteration works for Series, list and tuple.
    values = list(cols)
    age = values[0]
    pclass = values[1]

    if not pd.isnull(age):
        return age

    # Classes 1 and 2 use their own mean; everything else uses class 3.
    reference_class = pclass if pclass in (1, 2) else 3
    same_class = train['Pclass'] == reference_class
    # Series.mean() skips NaN, so the missing ages do not affect the result.
    return train.loc[same_class, 'Age'].mean()
# Fill the gaps in Age row by row using impute_age.
train['Age'] = train[['Age', 'Pclass']].apply(impute_age, axis=1)

# Encoding categorical data and updating the dataframe
# One indicator frame per categorical column; drop_first=True omits the first
# level of each so the indicators are not redundant.
sex, emb = (
    pd.get_dummies(train[column], drop_first=True)
    for column in ('Sex', 'Embarked')
)

# Append the indicator columns, then remove the text columns they replace.
train = pd.concat([train, sex, emb], axis=1)
train = train.drop(columns=['Sex', 'Embarked'])
# # Classification with random forest classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Target is the Survived column; every other remaining column is a feature.
# NOTE(review): this leaves PassengerId in X (it is never dropped above) --
# confirm that an identifier column is really wanted as a model input.
y = train['Survived']
X = train.drop(columns='Survived')

# Hold out 30% of the rows for evaluation, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=11, test_size=0.3
)

# Baseline: 200 trees with no depth limit.
rfc = RandomForestClassifier(random_state=11, n_estimators=200)
rfc.fit(X_train, y_train)
pred_rfc = rfc.predict(X_test)

# Print the confusion matrix first, then the per-class report.
for report in (confusion_matrix, classification_report):
    print(report(y_test, pred_rfc))
# Sweep max_depth from 1 to 9 and record the misclassification rate of each
# forest so the effect of tree depth can be inspected visually.
# NOTE(review): the rate is measured on X_test/y_test, the same split used for
# the final report below, so a depth picked from this curve is tuned on the
# evaluation data -- consider a separate validation split or cross-validation.
errorRateDepth = []
depthList = list(range(1, 10))
for i in depthList:
    rfcOpt = RandomForestClassifier(n_estimators=200, max_depth=i, random_state=11)
    rfcOpt.fit(X_train, y_train)
    predOpt = rfcOpt.predict(X_test)
    # Mean of the element-wise mismatch mask is the fraction of wrong predictions.
    errorRateDepth.append(np.mean(y_test != predOpt))

plt.figure(figsize=(12, 6))
plt.plot(depthList, errorRateDepth)
plt.xlabel('Max Depth')
plt.ylabel('Error rate')
# When this file runs as a plain script the figure is never rendered unless it
# is shown explicitly, so display it here.
plt.show()
# Refit with a shallow forest (max_depth=3, presumably chosen from the
# error-rate curve) and report its metrics on the held-out split.
# random_state is pinned to the same seed as the other forests in this script
# so the printed numbers are reproducible and comparable between runs.
rfc_opt = RandomForestClassifier(n_estimators=200, max_depth=3, random_state=11)
rfc_opt.fit(X_train, y_train)
pred_rfc_opt = rfc_opt.predict(X_test)
print(confusion_matrix(y_test, pred_rfc_opt))
print(classification_report(y_test, pred_rfc_opt))
# The author reported a slightly better accuracy of about 0.85 for this
# configuration. That figure came from an unseeded run, so check it against
# the report printed above.