Assignment_3

prashant182 · Jun 4, 2016 · 89db232 · 89db232
1 parent ed9bcba
commit 89db232
Show file tree

Hide file tree

Showing 4 changed files with 24,040 additions and 0 deletions.
diff --git a/NYU_DS_Assignment3/Assignment3_final2.pdf b/NYU_DS_Assignment3/Assignment3_final2.pdf
diff --git a/NYU_DS_Assignment3/Assignment_3_final.Rmd b/NYU_DS_Assignment3/Assignment_3_final.Rmd
@@ -0,0 +1,278 @@
+---
+title: "Assignment_3_final"
+author: "Fenil"
+date: "March 22, 2016"
+output: pdf_document
+---
+
+#Q-1A)
+
+#importing the dataset and skipping for first four lines as they are not data
+rs_man<-read.table(file.choose(),header = T,skip =4,sep=",")
+View(rs_man)
+rs_mandf<-data.frame(rs_man)
+
+#changes the date format to month/day/year
+
+rs_mandf[,21]<-as.Date(rs_mandf[,21],"%m/%d/%y")
+
+#remove the 7th and 1st column from the data set as they are not significant for us
+rs_mandf_modify<-rs_mandf[,-7]
+rs_mandf_modify<-rs_mandf_modify[,-1]
+
+#if value of the LAND.SQUARE.FEET==0 or GROSS.SQUARE.FEET==0 then we are setting it to null so that we can use rm.na=T in #methods
+
+rs_mandf_modify$LAND.SQUARE.FEET[rs_mandf_modify$LAND.SQUARE.FEET==0] <- NA
+rs_mandf_modify$GROSS.SQUARE.FEET[rs_mandf_modify$GROSS.SQUARE.FEET==0] <- NA
+#install.packages("stringr", dependencies=TRUE)
+library(stringr)
+
+#if apartment value is "" then we replace it to NA
+
+rs_mandf_modify$APARTMENT.NUMBER[str_trim(rs_mandf_modify$APARTMENT.NUMBER)==""] <- NA
+
+#rs_mandf_modify$APARTMENT.NUMBER[str_trim(rs_mandf_modify$APARTMENT.NUMBER)=="" || #str_trim(rs_mandf_modify$APARTMENT.NUMBER)=="-"] <- NA
+
+#we are removing , from the SALE.PRICE so that we can convert it into integer
+
+rs_mandf_modify[,18]<-as.numeric(gsub(",","",rs_mandf_modify[,18]))
+
+#plot(rs_mandf_modify[,19])--useless code
+
+#we are removing , from the GROSS.SQUARE.FEET so that we can convert it into integer
+
+rs_mandf_modify[,14]<-as.numeric(gsub(",","",rs_mandf_modify[,14]))
+
+#plot(rs_mandf_modify[,14])---useless code
+
+#we are removing , from the LAND.SQAURE.FEET so that we can convert it into integer
+rs_mandf_modify[,13]<-as.numeric(gsub(",","",rs_mandf_modify[,13]))
+#plot(rs_mandf_modify[,15])---useless code
+
+#Q-1B)
+#Run 1-b without running 1-a to get required output
+#importing the dataset and skipping for first four lines as they are not data
+rs_man<-read.table(file.choose(),header = T,skip =4,sep=",")
+View(rs_man)
+rs_mandf<-data.frame(rs_man)
+
+#changes the date format to month/day/year
+
+rs_mandf[,21]<-as.Date(rs_mandf[,21],"%m/%d/%y")
+
+#remove the 7th and 1st column from the data set as they are not significant for us
+rs_mandf_modify<-rs_mandf[,-7]
+rs_mandf_modify<-rs_mandf_modify[,-1]
+
+#if value of the LAND.SQUARE.FEET==0 or GROSS.SQUARE.FEET==0 then we are setting it to null so that we can use rm.na=T in #methods
+
+rs_mandf_modify$LAND.SQUARE.FEET[rs_mandf_modify$LAND.SQUARE.FEET==0] <- NA
+rs_mandf_modify$GROSS.SQUARE.FEET[rs_mandf_modify$GROSS.SQUARE.FEET==0] <- NA
+#install.packages("stringr", dependencies=TRUE)
+library(stringr)
+
+#if apartment value is "" then we replace it to NA
+
+rs_mandf_modify$APARTMENT.NUMBER[str_trim(rs_mandf_modify$APARTMENT.NUMBER)==""] <- NA
+
+#rs_mandf_modify$APARTMENT.NUMBER[str_trim(rs_mandf_modify$APARTMENT.NUMBER)=="" || #str_trim(rs_mandf_modify$APARTMENT.NUMBER)=="-"] <- NA
+
+#we are removing , from the SALE.PRICE so that we can convert it into integer
+
+rs_mandf_modify[,18]<-as.numeric(gsub(",","",rs_mandf_modify[,18]))
+
+#plot(rs_mandf_modify[,19])--useless code
+
+#we are removing , from the GROSS.SQUARE.FEET so that we can convert it into integer
+
+rs_mandf_modify[,14]<-as.numeric(gsub(",","",rs_mandf_modify[,14]))
+
+#plot(rs_mandf_modify[,14])---useless code
+
+#we are removing , from the LAND.SQAURE.FEET so that we can convert it into integer
+rs_mandf_modify[,13]<-as.numeric(gsub(",","",rs_mandf_modify[,13]))
+#plot(rs_mandf_modify[,15])---useless code
+
+#we are taking uniq counts of the neighborhood
+
+plot(rs_mandf_modify$NEIGHBORHOOD,rs_mandf_modify$SALE.PRICE)
+uniq<-unique(rs_mandf_modify$NEIGHBORHOOD)
+length(uniq)
+
+rs_mandf_modify.sale<-rs_mandf_modify[which(rs_mandf_modify$GROSS.SQUARE.FEET>0 & rs_mandf_modify$LAND.SQUARE.FEET>0 & rs_mandf_modify$SALE.PRICE>0),]
+
+
+neighbourhoods_sd <- data.frame(neighbourhoods = unique(rs_mandf_modify.sale$NEIGHBORHOOD))
+
+for (i in 1:38)
+{
+  neighbourhoods_sd$sd_sale_price[i] <- sd(rs_mandf_modify.sale$SALE.PRICE[rs_mandf_modify.sale$NEIGHBORHOOD == neighbourhoods_sd$neighbourhoods[i]],na.rm = TRUE)
+  neighbourhoods_sd$sd_Commercial_units[i] <- sd(rs_mandf_modify.sale$COMMERCIAL.UNITS[rs_mandf_modify.sale$NEIGHBORHOOD == neighbourhoods_sd$neighbourhoods[i]],na.rm = TRUE)
+  neighbourhoods_sd$sd_Residential_units[i] <- sd(rs_mandf_modify.sale$RESIDENTIAL.UNITS[rs_mandf_modify.sale$NEIGHBORHOOD == neighbourhoods_sd$neighbourhoods[i]],na.rm = TRUE)
+  neighbourhoods_sd$sd_Land_Square_feet[i] <- sd(rs_mandf_modify.sale$LAND.SQUARE.FEET[rs_mandf_modify.sale$NEIGHBORHOOD == neighbourhoods_sd$neighbourhoods[i]],na.rm = TRUE)
+  neighbourhoods_sd$sd_built_square_feet[i] <- sd(rs_mandf_modify.sale$GROSS.SQUARE.FEET[rs_mandf_modify.sale$NEIGHBORHOOD == neighbourhoods_sd$neighbourhoods[i]],na.rm = TRUE)
+
+}
+View(neighbourhoods_sd)
+
+#rs_mandf_modify$SALE.PRICE[rs_mandf_modify$NEIGHBORHOOD==uniq[1]]
+
+#we are plotting SALE.PRICE for all the neighbourhoods and their counts
+for(i in 1:length(neighbourhoods_sd)){
+plot(rs_mandf_modify$SALE.PRICE[rs_mandf_modify$NEIGHBORHOOD==uniq[i]],col=i,xlab=uniq[i],ylab="SALE.PRICE",pch=20)
+}
+
+#we are polting graph of SALE.PRICE vS SALE.DATE
+plot(rs_mandf_modify$SALE.DATE,rs_mandf_modify$SALE.PRICE,ylab="SALE.PRICE",xlab="SALE.DATE",pch=20)
+
+#Q-2a)
+
+rs_mandf_modify.sale<-rs_mandf_modify[which(rs_mandf_modify$GROSS.SQUARE.FEET>0 & rs_mandf_modify$LAND.SQUARE.FEET>0 & rs_mandf_modify$SALE.PRICE>0),]
+
+model<-lm((SALE.PRICE)~(GROSS.SQUARE.FEET)+(LAND.SQUARE.FEET)+factor(NEIGHBORHOOD)*factor(BUILDING.CLASS.CATEGORY),data=rs_mandf_modify.sale)
+summary(model)
+
+
+#Q-2b)
+rs_mandf_modify.sale<-rs_mandf_modify[which(rs_mandf_modify$GROSS.SQUARE.FEET>0 & rs_mandf_modify$LAND.SQUARE.FEET>0 & rs_mandf_modify$SALE.PRICE>0),]
+
+
+model<-lm((SALE.PRICE)~(GROSS.SQUARE.FEET)+(LAND.SQUARE.FEET)+factor(NEIGHBORHOOD)*factor(BUILDING.CLASS.CATEGORY),data=rs_mandf_modify.sale)
+summary(model)
+plot(resid(model)~fitted(model))
+
+#Q-2c)
+#Predicting neighborhood using following column
+
+rs_mandf_frame1<-data.frame(rs_mandf_modify.sale[,9])
+#View(rs_mandf_frame1)
+#rs_mandf_frame1<-rs_mandf_frame1[sample(nrow(rs_mandf_frame1)),]
+
+#created response variable dataframecoll using NEIGHBORHOOD
+
+dataframecoll=data.frame(rs_mandf_modify.sale$NEIGHBORHOOD) 
+#View(rs_mandf_frame1)
+
+#Created 10 equally size folds
+folds <- cut(seq(1,nrow(rs_mandf_frame1)),breaks=10,labels=FALSE)
+
+#Created vector Accaracyvector for storing accuracy and icoll to store iteration var i
+Accuracyvector<-integer()
+icoll<-integer()
+
+#Performed 10 fold cross validation
+
+for(i in 1:10){
+    icoll<-c(icoll,i)
+#Segement your data by fold using the which() function 
+    testIndexes <- which(folds==i,arr.ind=TRUE)
+    testData <- rs_mandf_frame1[testIndexes, ]
+#print(length(testData$SALE.PRICE))
+    trainData <- rs_mandf_frame1[-testIndexes, ]
+#print(length(trainData$SALE.PRICE))
+    trainres<-(dataframecoll[-testIndexes,])
+    testres<-(dataframecoll[testIndexes,])
+    library(e1071)
+    library(class)
+    library(caret)
+    knn_req<-knn(train=data.frame(trainData),test=data.frame(testData),cl=trainres,k=3)
+#print(knn_req)  Here I have created confusion matrix that give prediction for all the folds
+    GetAccuracy<-confusionMatrix(testres,knn_req)
+#print(confusionMatrix(testres,knn_req))
+#print(GetAccuracy$overall[1])
+    Accuracyvector<-c(Accuracyvector,GetAccuracy$overall[1])
+}
+
+#ploting acccuracy versus fold count
+
+plot(icoll,Accuracyvector,xlab="Fold Counter",ylab="Accuracy")
+sum=0
+for(acc in Accuracyvector)
+{
+  sum=sum+acc
+}
+
+#printed the average accuracy of all the 10 folds
+
+print(sum/length(Accuracyvector))
+
+#Q-2d
+
+#Predicting neighborhood using following column
+rs_mandf_frame1<-data.frame(rs_mandf_modify.sale[,9])
+#View(rs_mandf_frame1)
+#rs_mandf_frame1<-rs_mandf_frame1[sample(nrow(rs_mandf_frame1)),]
+
+#created response variable dataframecoll using NEIGHBORHOOD
+
+dataframecoll=data.frame(rs_mandf_modify.sale$NEIGHBORHOOD) 
+#View(rs_mandf_frame1)
+
+#Created 10 equally size folds
+folds <- cut(seq(1,nrow(rs_mandf_frame1)),breaks=10,labels=FALSE)
+
+#Created vector Accaracyvector for storing accuracy and icoll to store iteration var i
+Accuracyvector<-integer()
+icoll<-integer()
+
+#Performed 10 fold cross validation
+
+for(i in 1:10){
+    icoll<-c(icoll,i)
+#Segement your data by fold using the which() function 
+    testIndexes <- which(folds==i,arr.ind=TRUE)
+    testData <- rs_mandf_frame1[testIndexes, ]
+#print(length(testData$SALE.PRICE))
+    trainData <- rs_mandf_frame1[-testIndexes, ]
+#print(length(trainData$SALE.PRICE))
+    trainres<-(dataframecoll[-testIndexes,])
+    testres<-(dataframecoll[testIndexes,])
+    knn_req<-knn(train=data.frame(trainData),test=data.frame(testData),cl=trainres,k=3)
+#print(knn_req)  Here I have created confusion matrix that give prediction for all the folds
+    GetAccuracy<-confusionMatrix(testres,knn_req)
+#print(confusionMatrix(testres,knn_req))
+#print(GetAccuracy$overall[1])
+    Accuracyvector<-c(Accuracyvector,GetAccuracy$overall[1])
+}
+
+#ploting acccuracy versus fold count
+
+plot(icoll,Accuracyvector,xlab="Fold Counter",ylab="Accuracy")
+sum=0
+for(acc in Accuracyvector)
+{
+  sum=sum+acc
+}
+
+#printed the average accuracy of all the 10 folds
+
+print(sum/length(Accuracyvector))
+
+#Q-2E)
+```
+As we can see that our overall average accuracy for 10 fold cross validation is around 46% which is not significant. So our try should be to improve the accuracy of prediction by creating the folds so that there are more similar and significant values in the same fold .Thus, model can learn from training data set very efficiently and can predict the response variable for test data with more accuracy  
+Also from the graph we can see that for the first 5 folds the accuracy is continuously increasing for k=3 and then after the accuracy is continuously decreasing.From this observation we can say that there are more likely and similar values for the predictors for the 5th and 6th fold than any other fold.We can also say that predictors values are not that good and repetitive in sense for 1st ,2nd  and 10th observation for the purpose of learning model.
+```
+
+#Q-3A)
+#we perform scaling on the variables to perform PCA
+scaled_sales <- data.frame(scale(rs_mandf_modify.sale[,c(13,14,18)], scale = TRUE))
+# we perform PCA analyses on the data
+PCA = prcomp(~GROSS.SQUARE.FEET+LAND.SQUARE.FEET+SALE.PRICE,data=scaled_sales,na.action =na.omit,scale=T)
+str(PCA)
+summary(PCA)
+predict(PCA)
+plot(PCA,type="l")
+
+#Q-3B)
+
+# first we plot the first two vectors
+scr=as.matrix(scaled_sales[,c(1:3)])%*%PCA$rot[,1:2]
+plot(scr,col="blue",ylab="PC2 and PC3")
+#now we plot the first and third vector
+scr_1=as.matrix(scaled_sales[,c(1:3)])%*%PCA$rot[,c(1,3)]
+points(scr_1,col="red")
+
+#we can visualize the 3 vectors using an inbuilt function of r biplot
+
+biplot(PCA)
diff --git a/NYU_DS_Assignment3/CS3943-9223_Assignment3.pdf b/NYU_DS_Assignment3/CS3943-9223_Assignment3.pdf