Skip to content

Commit

Permalink
Assignment_3
Browse files Browse the repository at this point in the history
  • Loading branch information
tailorfenil committed Jun 4, 2016
1 parent ed9bcba commit 89db232
Show file tree
Hide file tree
Showing 4 changed files with 24,040 additions and 0 deletions.
Binary file added NYU_DS_Assignment3/Assignment3_final2.pdf
Binary file not shown.
278 changes: 278 additions & 0 deletions NYU_DS_Assignment3/Assignment_3_final.Rmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,278 @@
---
title: "Assignment_3_final"
author: "Fenil"
date: "March 22, 2016"
output: pdf_document
---

#Q-1A)

#importing the dataset and skipping for first four lines as they are not data
rs_man<-read.table(file.choose(),header = T,skip =4,sep=",")
View(rs_man)
rs_mandf<-data.frame(rs_man)

#changes the date format to month/day/year

rs_mandf[,21]<-as.Date(rs_mandf[,21],"%m/%d/%y")

#remove the 7th and 1st column from the data set as they are not significant for us
rs_mandf_modify<-rs_mandf[,-7]
rs_mandf_modify<-rs_mandf_modify[,-1]

#if value of the LAND.SQUARE.FEET==0 or GROSS.SQUARE.FEET==0 then we are setting it to null so that we can use rm.na=T in #methods

rs_mandf_modify$LAND.SQUARE.FEET[rs_mandf_modify$LAND.SQUARE.FEET==0] <- NA
rs_mandf_modify$GROSS.SQUARE.FEET[rs_mandf_modify$GROSS.SQUARE.FEET==0] <- NA
#install.packages("stringr", dependencies=TRUE)
library(stringr)

#if apartment value is "" then we replace it to NA

rs_mandf_modify$APARTMENT.NUMBER[str_trim(rs_mandf_modify$APARTMENT.NUMBER)==""] <- NA

#rs_mandf_modify$APARTMENT.NUMBER[str_trim(rs_mandf_modify$APARTMENT.NUMBER)=="" || #str_trim(rs_mandf_modify$APARTMENT.NUMBER)=="-"] <- NA

#we are removing , from the SALE.PRICE so that we can convert it into integer

rs_mandf_modify[,18]<-as.numeric(gsub(",","",rs_mandf_modify[,18]))

#plot(rs_mandf_modify[,19])--useless code

#we are removing , from the GROSS.SQUARE.FEET so that we can convert it into integer

rs_mandf_modify[,14]<-as.numeric(gsub(",","",rs_mandf_modify[,14]))

#plot(rs_mandf_modify[,14])---useless code

#we are removing , from the LAND.SQAURE.FEET so that we can convert it into integer
rs_mandf_modify[,13]<-as.numeric(gsub(",","",rs_mandf_modify[,13]))
#plot(rs_mandf_modify[,15])---useless code

#Q-1B)
#Run 1-b without running 1-a to get required output
#importing the dataset and skipping for first four lines as they are not data
rs_man<-read.table(file.choose(),header = T,skip =4,sep=",")
View(rs_man)
rs_mandf<-data.frame(rs_man)

#changes the date format to month/day/year

rs_mandf[,21]<-as.Date(rs_mandf[,21],"%m/%d/%y")

#remove the 7th and 1st column from the data set as they are not significant for us
rs_mandf_modify<-rs_mandf[,-7]
rs_mandf_modify<-rs_mandf_modify[,-1]

#if value of the LAND.SQUARE.FEET==0 or GROSS.SQUARE.FEET==0 then we are setting it to null so that we can use rm.na=T in #methods

rs_mandf_modify$LAND.SQUARE.FEET[rs_mandf_modify$LAND.SQUARE.FEET==0] <- NA
rs_mandf_modify$GROSS.SQUARE.FEET[rs_mandf_modify$GROSS.SQUARE.FEET==0] <- NA
#install.packages("stringr", dependencies=TRUE)
library(stringr)

#if apartment value is "" then we replace it to NA

rs_mandf_modify$APARTMENT.NUMBER[str_trim(rs_mandf_modify$APARTMENT.NUMBER)==""] <- NA

#rs_mandf_modify$APARTMENT.NUMBER[str_trim(rs_mandf_modify$APARTMENT.NUMBER)=="" || #str_trim(rs_mandf_modify$APARTMENT.NUMBER)=="-"] <- NA

#we are removing , from the SALE.PRICE so that we can convert it into integer

rs_mandf_modify[,18]<-as.numeric(gsub(",","",rs_mandf_modify[,18]))

#plot(rs_mandf_modify[,19])--useless code

#we are removing , from the GROSS.SQUARE.FEET so that we can convert it into integer

rs_mandf_modify[,14]<-as.numeric(gsub(",","",rs_mandf_modify[,14]))

#plot(rs_mandf_modify[,14])---useless code

#we are removing , from the LAND.SQAURE.FEET so that we can convert it into integer
rs_mandf_modify[,13]<-as.numeric(gsub(",","",rs_mandf_modify[,13]))
#plot(rs_mandf_modify[,15])---useless code

#we are taking uniq counts of the neighborhood

plot(rs_mandf_modify$NEIGHBORHOOD,rs_mandf_modify$SALE.PRICE)
uniq<-unique(rs_mandf_modify$NEIGHBORHOOD)
length(uniq)

rs_mandf_modify.sale<-rs_mandf_modify[which(rs_mandf_modify$GROSS.SQUARE.FEET>0 & rs_mandf_modify$LAND.SQUARE.FEET>0 & rs_mandf_modify$SALE.PRICE>0),]


neighbourhoods_sd <- data.frame(neighbourhoods = unique(rs_mandf_modify.sale$NEIGHBORHOOD))

for (i in 1:38)
{
neighbourhoods_sd$sd_sale_price[i] <- sd(rs_mandf_modify.sale$SALE.PRICE[rs_mandf_modify.sale$NEIGHBORHOOD == neighbourhoods_sd$neighbourhoods[i]],na.rm = TRUE)
neighbourhoods_sd$sd_Commercial_units[i] <- sd(rs_mandf_modify.sale$COMMERCIAL.UNITS[rs_mandf_modify.sale$NEIGHBORHOOD == neighbourhoods_sd$neighbourhoods[i]],na.rm = TRUE)
neighbourhoods_sd$sd_Residential_units[i] <- sd(rs_mandf_modify.sale$RESIDENTIAL.UNITS[rs_mandf_modify.sale$NEIGHBORHOOD == neighbourhoods_sd$neighbourhoods[i]],na.rm = TRUE)
neighbourhoods_sd$sd_Land_Square_feet[i] <- sd(rs_mandf_modify.sale$LAND.SQUARE.FEET[rs_mandf_modify.sale$NEIGHBORHOOD == neighbourhoods_sd$neighbourhoods[i]],na.rm = TRUE)
neighbourhoods_sd$sd_built_square_feet[i] <- sd(rs_mandf_modify.sale$GROSS.SQUARE.FEET[rs_mandf_modify.sale$NEIGHBORHOOD == neighbourhoods_sd$neighbourhoods[i]],na.rm = TRUE)

}
View(neighbourhoods_sd)

#rs_mandf_modify$SALE.PRICE[rs_mandf_modify$NEIGHBORHOOD==uniq[1]]

#we are plotting SALE.PRICE for all the neighbourhoods and their counts
for(i in 1:length(neighbourhoods_sd)){
plot(rs_mandf_modify$SALE.PRICE[rs_mandf_modify$NEIGHBORHOOD==uniq[i]],col=i,xlab=uniq[i],ylab="SALE.PRICE",pch=20)
}

#we are polting graph of SALE.PRICE vS SALE.DATE
plot(rs_mandf_modify$SALE.DATE,rs_mandf_modify$SALE.PRICE,ylab="SALE.PRICE",xlab="SALE.DATE",pch=20)

#Q-2a)

rs_mandf_modify.sale<-rs_mandf_modify[which(rs_mandf_modify$GROSS.SQUARE.FEET>0 & rs_mandf_modify$LAND.SQUARE.FEET>0 & rs_mandf_modify$SALE.PRICE>0),]

model<-lm((SALE.PRICE)~(GROSS.SQUARE.FEET)+(LAND.SQUARE.FEET)+factor(NEIGHBORHOOD)*factor(BUILDING.CLASS.CATEGORY),data=rs_mandf_modify.sale)
summary(model)


#Q-2b)
rs_mandf_modify.sale<-rs_mandf_modify[which(rs_mandf_modify$GROSS.SQUARE.FEET>0 & rs_mandf_modify$LAND.SQUARE.FEET>0 & rs_mandf_modify$SALE.PRICE>0),]


model<-lm((SALE.PRICE)~(GROSS.SQUARE.FEET)+(LAND.SQUARE.FEET)+factor(NEIGHBORHOOD)*factor(BUILDING.CLASS.CATEGORY),data=rs_mandf_modify.sale)
summary(model)
plot(resid(model)~fitted(model))

#Q-2c)
#Predicting neighborhood using following column

rs_mandf_frame1<-data.frame(rs_mandf_modify.sale[,9])
#View(rs_mandf_frame1)
#rs_mandf_frame1<-rs_mandf_frame1[sample(nrow(rs_mandf_frame1)),]

#created response variable dataframecoll using NEIGHBORHOOD

dataframecoll=data.frame(rs_mandf_modify.sale$NEIGHBORHOOD)
#View(rs_mandf_frame1)

#Created 10 equally size folds
folds <- cut(seq(1,nrow(rs_mandf_frame1)),breaks=10,labels=FALSE)

#Created vector Accaracyvector for storing accuracy and icoll to store iteration var i
Accuracyvector<-integer()
icoll<-integer()

#Performed 10 fold cross validation

for(i in 1:10){
icoll<-c(icoll,i)
#Segement your data by fold using the which() function
testIndexes <- which(folds==i,arr.ind=TRUE)
testData <- rs_mandf_frame1[testIndexes, ]
#print(length(testData$SALE.PRICE))
trainData <- rs_mandf_frame1[-testIndexes, ]
#print(length(trainData$SALE.PRICE))
trainres<-(dataframecoll[-testIndexes,])
testres<-(dataframecoll[testIndexes,])
library(e1071)
library(class)
library(caret)
knn_req<-knn(train=data.frame(trainData),test=data.frame(testData),cl=trainres,k=3)
#print(knn_req) Here I have created confusion matrix that give prediction for all the folds
GetAccuracy<-confusionMatrix(testres,knn_req)
#print(confusionMatrix(testres,knn_req))
#print(GetAccuracy$overall[1])
Accuracyvector<-c(Accuracyvector,GetAccuracy$overall[1])
}

#ploting acccuracy versus fold count

plot(icoll,Accuracyvector,xlab="Fold Counter",ylab="Accuracy")
sum=0
for(acc in Accuracyvector)
{
sum=sum+acc
}

#printed the average accuracy of all the 10 folds

print(sum/length(Accuracyvector))

#Q-2d

#Predicting neighborhood using following column
rs_mandf_frame1<-data.frame(rs_mandf_modify.sale[,9])
#View(rs_mandf_frame1)
#rs_mandf_frame1<-rs_mandf_frame1[sample(nrow(rs_mandf_frame1)),]

#created response variable dataframecoll using NEIGHBORHOOD

dataframecoll=data.frame(rs_mandf_modify.sale$NEIGHBORHOOD)
#View(rs_mandf_frame1)

#Created 10 equally size folds
folds <- cut(seq(1,nrow(rs_mandf_frame1)),breaks=10,labels=FALSE)

#Created vector Accaracyvector for storing accuracy and icoll to store iteration var i
Accuracyvector<-integer()
icoll<-integer()

#Performed 10 fold cross validation

for(i in 1:10){
icoll<-c(icoll,i)
#Segement your data by fold using the which() function
testIndexes <- which(folds==i,arr.ind=TRUE)
testData <- rs_mandf_frame1[testIndexes, ]
#print(length(testData$SALE.PRICE))
trainData <- rs_mandf_frame1[-testIndexes, ]
#print(length(trainData$SALE.PRICE))
trainres<-(dataframecoll[-testIndexes,])
testres<-(dataframecoll[testIndexes,])
knn_req<-knn(train=data.frame(trainData),test=data.frame(testData),cl=trainres,k=3)
#print(knn_req) Here I have created confusion matrix that give prediction for all the folds
GetAccuracy<-confusionMatrix(testres,knn_req)
#print(confusionMatrix(testres,knn_req))
#print(GetAccuracy$overall[1])
Accuracyvector<-c(Accuracyvector,GetAccuracy$overall[1])
}

#ploting acccuracy versus fold count

plot(icoll,Accuracyvector,xlab="Fold Counter",ylab="Accuracy")
sum=0
for(acc in Accuracyvector)
{
sum=sum+acc
}

#printed the average accuracy of all the 10 folds

print(sum/length(Accuracyvector))

#Q-2E)
```
As we can see that our overall average accuracy for 10 fold cross validation is around 46% which is not significant. So our try should be to improve the accuracy of prediction by creating the folds so that there are more similar and significant values in the same fold .Thus, model can learn from training data set very efficiently and can predict the response variable for test data with more accuracy
Also from the graph we can see that for the first 5 folds the accuracy is continuously increasing for k=3 and then after the accuracy is continuously decreasing.From this observation we can say that there are more likely and similar values for the predictors for the 5th and 6th fold than any other fold.We can also say that predictors values are not that good and repetitive in sense for 1st ,2nd and 10th observation for the purpose of learning model.
```

#Q-3A)
#we perform scaling on the variables to perform PCA
scaled_sales <- data.frame(scale(rs_mandf_modify.sale[,c(13,14,18)], scale = TRUE))
# we perform PCA analyses on the data
PCA = prcomp(~GROSS.SQUARE.FEET+LAND.SQUARE.FEET+SALE.PRICE,data=scaled_sales,na.action =na.omit,scale=T)
str(PCA)
summary(PCA)
predict(PCA)
plot(PCA,type="l")

#Q-3B)

# first we plot the first two vectors
scr=as.matrix(scaled_sales[,c(1:3)])%*%PCA$rot[,1:2]
plot(scr,col="blue",ylab="PC2 and PC3")
#now we plot the first and third vector
scr_1=as.matrix(scaled_sales[,c(1:3)])%*%PCA$rot[,c(1,3)]
points(scr_1,col="red")

#we can visualize the 3 vectors using an inbuilt function of r biplot

biplot(PCA)
Binary file added NYU_DS_Assignment3/CS3943-9223_Assignment3.pdf
Binary file not shown.
Loading

0 comments on commit 89db232

Please sign in to comment.