Skip to content

Commit

Permalink
Adding old versions of Ken's R scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
ctwardy committed May 7, 2014
1 parent 4a665b8 commit 9a1c538
Show file tree
Hide file tree
Showing 2 changed files with 339 additions and 0 deletions.
18 changes: 18 additions & 0 deletions AggregateTest.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
th<-read.csv("...trade_history_report.csv")
tha<-read.csv("...trade_history_report2014-02-21T12_46_03.csv")

pit <- th$user_id; cat <- as.POSIXct(th$traded_at)
start <- as.POSIXct("2013-12-02 00:00:00 EST"); days <- seq(1,ceiling(as.double(Sys.time()-60*60*24 - start)),1)
np <- nt <- numeric()
for (d in 1:max(days)) {
np[d] <- length(unique(pit[cat>=start+(d-1)*60*60*24&cat<start+d*60*60*24]))
nt[d] <- length(cat[cat>=start+(d-1)*60*60*24&cat<start+d*60*60*24])
}
np.m <- mean(np)

temp <-as.vector(tha$num_trades); npm <- as.double(temp[length(temp)-1]); nta <- as.double(temp[1:(length(temp)-6)])

npm-np.m
npm-sum(np)/(length(np)-2)

length(nta)-length(nt)
321 changes: 321 additions & 0 deletions Recurring Stats.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,321 @@
#Desired Stats and Graphs in order of coding priority: #
# Timing on monthly reports?

# n trades per day graphed over days #
# n conditional trades per day graphed over days #
# n trades per question per day graphed over days #
# n trades per user per day graphed over days #
# n questions open per day graphed over days NEED DATE POSTED
# n questions active (traded or commented) per day graphed over days
# n questions active per category per week graphed over weeks
# n users logged in per week graphed over weeks NO LOGIN DATA!
# retention graph in the style of Ken
# n new users
# n users who haven't been active for more than x weeks
# n comments per day graphed over days #
# cumulative Brier score (averaged over days) from nightly snapshot
# BS distribution (0 to 2) over questions per week graphed over weeks
# dashes? or box and whisker?
# BS distribution over questions per category
# BS distribution over questions per question type
# BS distribution over questions per duration of question (artificial categories)
# BS distribution over questions per time to resolution (artificial categories)
# Hit rate (for binary and multi questions) per week graphed over weeks
# Expected BS over weeks
# Calibration according to various biases

# What about demographics? and other Study 2.3 items?

# distribution of trades per user for specified time frame as graph of user rank v. n trades
# distribution of score per user for specified time frame as graph of user rank v. SciCash
# distribution of trades per question for specified time frame as graph of question rank v. n trades

# n extant links
# n extant links per week graphed over weeks
# n links per open question

#
# Pull csv files from datamart.
# e.g., datamart.scicast.org/trade_history/?format=csv&api_key=XXXXXXX
#
# Read files pulled from DataMart.

setwd("C:/Users/kolson8/Documents/Psychology/Prediction Market/Y3")
#library(rjson)
th<-read.csv("C:/Users/kolson8/Documents/Psychology/Prediction Market/Y3/trade_history_report.csv")
#jth<- fromJSON(readLines("http://datamart.scicast.org/trade_history/?format=json&api_key=XXXXXXX"))
# th <- array(unlist(jth),c(9,length(jth))) # 3 empty variables
qh<-read.csv("C:/Users/kolson8/Documents/Psychology/Prediction Market/Y3/question_history_report.csv")
#jqh<- fromJSON(readLines("http://datamart.scicast.org/question_history/?format=json&api_key=XXXXXXX"))
# qh <- array(unlist(jqh),c(3,length(jqh)))
qn<-read.csv("C:/Users/kolson8/Documents/Psychology/Prediction Market/Y3/question_report.csv")
#jqn<- fromJSON(readLines("http://datamart.scicast.org/question/?format=json&api_key=XXXXXXX"))
# qn <- array(unlist(jqn),c(24,length(jqn)))
cm<-read.csv("C:/Users/kolson8/Documents/Psychology/Prediction Market/Y3/comment_report.csv")
#jcm<- fromJSON(readLines("http://datamart.scicast.org/comment/?format=json&api_key=XXXXXXX"))
# cm <- array(unlist(jcm),c(8,length(jcm))) # 2 empty variables
pr<-read.csv("C:/Users/kolson8/Documents/Psychology/Prediction Market/Y3/person_report.csv")
#jpr<- fromJSON(readLines("http://datamart.scicast.org/person/?format=json&api_key=XXXXXXX"))
# pr <- array(unlist(jpr),c(13,length(jpr)))

des<-read.csv("C:/Users/kolson8/Documents/Psychology/Prediction Market/Y3/demographic_survey.csv")
prs<-read.csv("C:/Users/kolson8/Documents/Psychology/Prediction Market/Y3/profession_survey.csv")
pss<-read.csv("C:/Users/kolson8/Documents/Psychology/Prediction Market/Y3/psychology_survey.csv")
sks<-read.csv("C:/Users/kolson8/Documents/Psychology/Prediction Market/Y3/skill_survey.csv")

ps1 <- merge(des,prs,by="Custom.Data",all=T); ps2 <- merge(pss,sks,by="Custom.Data",all=T)
prsnl <- merge(ps1,ps2,by="Custom.Data",all=T)

#
# Survey data
sid <- prsnl$Custom.Data; gn <- prsnl$What.is.your.gender.; ag <- prsnl$How.old.are.you.; oc1a <- prsnl$What.is.your.primary.occupation.;
oc1b <- prsnl$X.y; oc2a <- prsnl$What.additional.occupations.have.you.had.; oc2b <- prsnl$X.1.y; expt <- prsnl$In.what.additional.field.s..do.you.have.a.specialty.or.expertise.
pip <- pr$id; pac <- pr$num_trades; us <- pr$username; ac <- rep(0,length(sid))
for (i in 1:length(sid)) {
if (as.character(sid[i])%in%us) {
ac[i] <- pac[us==as.character(sid[i])]
}
}

cn <- c("user","gender","age","occupation1a","oc1b","oc2a","oc2b","expertise","trades")
write.table(data.frame(sid,gn,ag,oc1a,oc1b,oc2a,oc2b,expt,ac),file="personstats.csv",sep=",",append=F,col.names=cn,row.names=F)

#
# Checking on Study 2.1 participants

qit <- th$question_id; pit <- th$user_id
qiq <- qn$id; qr <- qn$roles
pir <- pr$roles
estart <- as.POSIXct("2014-02-21 01:00:00 EST"); edas <- ceiling(as.double(Sys.time()-60*60*24 - estart))
pie <- pip[pir=="Study 2.1A"|pir=="Study 2.1B"]
ea <- qiq[qr=="Study 2.1A"]; eb <- qiq[qr=="Study 2.1B"]; neq <- (length(ea)+length(eb))/2
eqac <- rep(0,length(pie)) # portion of questions in experiment that a person has edited at least once
etpd <- eqac # avg number of trades on experiment questions per day
o1a <- o1b <- o2a <- o2b <- xpt <- numeric()
for (i in 1:length(pie)) {
use[i] <- us[pip==pie[i]]
eqac[i] <- length(qit[pit==pie[i]&(qit%in%ea|qit%in%eb)])/neq
etpd[i] <- length(unique(qit[pit==pie[i]&(qit%in%ea|qit%in%eb)]))/edas
if (us[pip==pie[i]]%in%sid) {
o1a[i] <- oc1a[sid==us[pip==pie[i]]]; o1b[i] <- oc1b[sid==us[pip==pie[i]]]; o2a[i] <- oc2a[sid==us[pip==pie[i]]]; o2b[i] <- oc2b[sid==us[pip==pie[i]]]; xpt[i] <- expt[sid==us[pip==pie[i]]]
}
}
cn <- c("user","id","traded_questions","trades_per_day"occupation1a","oc1b","oc2a","oc2b","expertise")
write.table(data.frame(pie,use,eqac,etpd,o1a,o1b,o2a,o2b,xpt),file="participantstats.csv",sep=",",append=F,col.names=c("user","trades"),row.names=F)
# My own smoothing function!
smth <- function(x) {y<-x; for (i in 3:(length(x)-2)) { y[i] <- x[i-2]*0.1+x[i-1]*0.2+x[i]*0.4+x[i+1]*0.2+x[i+2]*0.1}; return(y)}
#
# Comments per day graph
cac <- as.POSIXct(cm$created_at); qic <- cm$question_id
#cac <- as.POSIXct(cm[6,,],format="%Y-%m-%dT%H:%M:%S"); qic <- cm[8,,]
start <- as.POSIXct("2013-11-25 01:00:00 EST")
days <- seq(1,ceiling(as.double(Sys.time()-60*60*24 - start)),1)
label <- as.character(c(25:30,"Dec 1",2:31,"Jan 1",2:31,"Feb 1",2:28))
nc <- numeric()
for (d in 1:max(days)) {
nc[d] <- length(cac[cac>=start+(d-1)*60*60*24&cac<start+d*60*60*24])
}
png("CpD.png", width = 7200, height = 3600, pointsize = 18, res = 360)
par(mar=c(5,4,4,4))
plot(days,nc,type="l",lwd=3,xaxt="n",ylim=c(0,max(nc)*1.25),ylab="Comments per Day",xlab="Date")
par(las=2)
axis(1,at=days,lab=label[1:length(days)])
dev.off()
#
# Trades per day graph
cat <- as.POSIXct(th$traded_at)
#cat <- as.POSIXct(th[ ],format="%Y-%m-%dT%H:%M:%S")
start <- as.POSIXct("2013-11-25 01:00:00 EST")
#days <- seq(1,ceiling(as.double(Sys.time()-60*60*24 - start)),1)
label <- as.character(c(25:30,"Dec 1",2:31,"Jan 1",2:31,"Feb 1",2:28))
nt <- numeric()
for (d in 1:max(days)) {
nt[d] <- length(cat[cat>=start+(d-1)*60*60*24&cat<start+d*60*60*24])
}
png("TpD.png", width = 7200, height = 3600, pointsize = 18, res = 360)
par(mar=c(5,4,4,4))
plot(days,nt,type="l",lwd=3,xaxt="n",ylim=c(0,max(nt)*1.25),ylab="Trades per Day",xlab="Date")
par(las=2)
axis(1,at=days,lab=label[1:length(days)])
dev.off()
#
# Activity per question per day graph
qit <- th$question_id
#cat <- as.POSIXct(th$created_at)
#start <- as.POSIXct("2013-11-30 00:00:00 EST")
#days <- seq(1,ceiling(as.double(Sys.time() - start)),1)
#label <- as.character(c(25:30,"Dec 1",2:31,"Jan 1",2:31))
nt.q <- nc.q <- qu <- numeric()
for (d in 1:max(days)) {
# nt[d] <- length(cat[cat>=start+(d-1)*60*60*24&cat<start+d*60*60*24])
qu[d] <- length(levels(factor(c(qit[cat>=start+(d-1)*60*60*24&cat<start+d*60*60*24],qic[cac>=start+(d-1)*60*60*24&cac<start+d*60*60*24]))))
if (qu[d] > 0) {
nt.q[d] <- nt[d]/qu[d]; nc.q[d] <- nc[d]/qu[d]
}
else {nt.q[d] <- nc.q[d] <- 0}
}
qiqh <- qh$question_id; st <- as.POSIXct(qh$sample_time)
#qiqh <- qh[3,]; st <- as.POSIXct(qh[2,],format="%Y-%m-%dT%H:%M:%S")
nq <- numeric()
for (d in 1:max(days)) {
nq[d] <- length(levels(factor(qiqh[st>=start+(d-1)*60*60*24&st<start+d*60*60*24])))
}
png("ApQpD.png", width = 7200, height = 3600, pointsize = 18, res = 360)
par(mar=c(5,4,4,4))
plot(days,nt.q,type="l",lwd=1,col=rgb(0.99,0.6,0.6),xaxt="n",ylim=c(0,ceiling(max(nq)^(1/2))),ylab="",xlab="Date")
lines(smooth.spline(days,smth(nt.q),df=ceiling(length(qu)/3),all.knots=T),lwd=3,col=rgb(0.95,0,0))
lines(days,nc.q,lty=2,lwd=1,col=rgb(0.99,0.6,0.6))
lines(smooth.spline(days,smth(nc.q),df=ceiling(length(qu)/3),all.knots=T),lty=2,lwd=3,col=rgb(0.95,0,0))
lines(days,qu^(1/2),lwd=1,col=rgb(0.6,0.6,1))
lines(smooth.spline(days,smth(qu^(1/2)),df=ceiling(length(qu)/3),all.knots=T),lwd=3,col=rgb(0,0,1))
lines(days,nq^(1/2),lty=2,lwd=1,col=rgb(0.6,0.6,1))
lines(smooth.spline(days,smth(nq^(1/2)),df=ceiling(length(qu)/3),all.knots=T),lty=2,lwd=3,col=rgb(0,0,1))
mtext("Activity per Day", outer=T,side=2,line=-1.5,font=1,col=rgb(0.95,0,0))
par(las=2)
axis(1,at=days,lab=label[1:length(days)])
axis(4,at=seq(0,ceiling(max(nq)^(1/2)),2),lab=seq(0,ceiling(max(nq)^(1/2)),2)^2)
par(las=0)
mtext("Questions per Day", outer=T,side=4,line=-1.5,font=1,col=rgb(0,0,1))
text(18,10,"Open Questions",col=rgb(0,0,1),cex=0.9)
text(18,6,"Active Questions",col=rgb(0,0,1),cex=0.9)
text(18,0.75,"Trades per Active Question",col=rgb(0.95,0,0),cex=0.9)
text(18,-0.25,"Comments per Active Question",col=rgb(0.95,0,0),cex=0.9)
dev.off()
#
# Conditional trades per day
as <- th$serialized_assumptions # Conditions set on trades
qit <- th$question_id # Question ID
qiq <- qn$id
ls <- qn$relationships_source_question_id; ld <- qn$relationships_destination_question_id
rq <- qiq[is.na(ld)==F] # Linked questions
#rq <- unique(qit[as!="None"]) # Older Linked questions
ptc <- length(qit[as!="None"])/length(qit[qit%in%rq]) # Portion of Conditional trades on eligible questions
#qiqh <- qh$question_id; st <- as.POSIXct(qh$sample_time)
nq <- nrq <- numeric(); ntc <- rep(0,length(days))
for (d in 1:max(days)) {
nq[d] <- length(levels(factor(qiqh[st>=start+(d-1)*60*60*24&st<start+d*60*60*24])))
nrq[d] <-length(levels(factor(qiqh[qiqh%in%rq&st>=start+(d-1)*60*60*24&st<start+d*60*60*24]))) # Number of linked questions
ntc[d] <-length(qit[as!="None"&cat>=start+(d-1)*60*60*24&cat<start+d*60*60*24]) # Number of conditional trades on eligible questions
# if (nrq[d]>0) {
# ntc[d] <-length(qit[as!="None"&cat>=start+(d-1)*60*60*24&cat<start+d*60*60*24])/nrq[d] # Portion of conditional trades on eligible questions
# }
}
png("CTpD.png", width = 7200, height = 3600, pointsize = 18, res = 360)
par(mar=c(5,4,4,4))
plot(days,ntc,type="l",lwd=1,col=rgb(0.99,0.6,0.6),xaxt="n",ylim=c(0,ceiling(max(nq)/6)),ylab="",xlab="Date")
lines(smooth.spline(days,smth(ntc),df=ceiling(length(ntc)/2),all.knots=T),lwd=3,col=rgb(0.95,0,0))
lines(days,nrq/6,lwd=1,col=rgb(0.6,0.6,1))
lines(smooth.spline(days,smth(nrq/6),df=ceiling(length(ntc)/3),all.knots=T),lwd=3,col=rgb(0,0,1))
lines(days,nq/6,lty=2,lwd=1,col=rgb(0.6,0.6,1))
lines(smooth.spline(days,smth(nq/6),df=ceiling(length(ntc)/3),all.knots=T),lty=2,lwd=3,col=rgb(0,0,1))
mtext("Trades per Day", outer=T,side=2,line=-1.5,font=1,col=rgb(0.95,0,0))
par(las=2)
axis(1,at=days,lab=label[1:length(days)])
axis(4,at=seq(0,ceiling(max(nq)/6),10),lab=seq(0,ceiling(max(nq)/6),10)*6)
par(las=0)
mtext("Questions per Day", outer=T,side=4,line=-1.5,font=1,col=rgb(0,0,1))
text(20,30,"Open Questions",col=rgb(0,0,1),cex=0.9)
text(20,12,"Linked Questions",col=rgb(0,0,1),cex=0.9)
text(20,6,"Conditional Trades on Linked Questions",col=rgb(0.95,0,0),cex=0.9)
dev.off()
#
# Questions active per category per week graph
weeks <- seq(1,ceiling(as.double(Sys.time()-60*60*24 - start)/7),1)
#qiq <- qh$question_id; st <- as.POSIXct(qh$sample_time)
#nq <- numeric()
ct <- qn$categories; qiq <- qn$id; wt <- rep(0,length(qiq)); cg <- array(numeric(), c(length(qiq),18))
for (j in 1:length(qiq)) {
temp <- levels(factor(strsplit(as.character(ct[j]),",")[[1]]))
wt[j]<-1/length(temp); cg[j,1:(length(temp))]<- temp
}
ac <- levels(factor(as.vector(cg)))
nq.c <- array(rep(0,length(ac)*max(weeks)),c(length(ac),max(weeks)))
for (w in 1:max(weeks)) {
temp <- unique(qiqh[st>=start+(w-1)*60*60*24*7&st<start+w*60*60*24*7])
cw <-numeric(); oc <-array(rep(NA,length(temp)*18),c(length(temp),18))
if (length(temp)>0) {
for (j in 1:length(temp)) {
cw[j]<-wt[qiq==temp[j]]; oc[j,]<-cg[qiq==temp[j]]
}
}
for (i in 1:length(ac)) {
if (length(oc[oc==ac[i]&is.na(oc)==F])>0) {
nq.c[i,w] <- sum(cw[which(oc==ac[i],arr.ind=T)[,1]])
}
}
}
nq.c.stack <- aperm(t(apply(aperm(nq.c), 1, cumsum)))
png("QpCpW.png", width = 7200, height = 3600, pointsize = 18, res = 360)
par(mar=c(5,4,4,4))
plot(weeks,nq.c.stack[1,],type="l",lwd=1,col=rgb(0.2,0.2,0.2),ylim=c(0,max(nq.c.stack)*1.05),ylab="Questions per Week",xlab="Week")
text(3.5,nq.c.stack[1,4]/2,ac[i],col=rgb(0.2,0.2,0.2),cex=0.7)
# par(las=2)
# axis(1,at=weeks)
for (i in 2:length(ac)) {
color<-rgb(rbeta(1,1.5,1.5)*0.85,rbeta(1,1.5,1.5)*0.85,rbeta(1,1.5,1.5)*0.85)
lines(weeks,nq.c.stack[i,],lwd=1,col=color)
par(las=0)
text(3.5,sum(nq.c.stack[(i-1):i,4])/2,ac[i],col=color,cex=0.7)
}
dev.off()
#
# Trades per person per day graph
cap <- as.POSIXct(pr$created_at); pip <- pr$id; pit <- th$user_id
np <- nt.p <- pu <- numeric()
for (d in 1:max(days)) {
# nt[d] <- length(cat[cat>=start+(d-1)*60*60*24&cat<start+d*60*60*24])
pu[d] <- length(levels(factor(pit[cat>=start+(d-1)*60*60*24&cat<start+d*60*60*24])))
np[d] <- length(pip[cap<start+d*60*60*24])
if (pu[d]>0) {
nt.p[d] <- nt[d]/pu[d]
}
else {nt.p[d] <- 0}
}
png("TpPpD.png", width = 7200, height = 3600, pointsize = 18, res = 360)
par(mar=c(5,4,4,4))
plot(days,nt.p,type="l",lwd=1,col=rgb(0.99,0.6,0.6),xaxt="n",ylim=c(0,ceiling((max(np)^(1/3)))),ylab="",xlab="Date")
lines(smooth.spline(days,smth(nt.p),df=ceiling(length(nt.p)/3),all.knots=T),lwd=3,col=rgb(0.95,0,0))
lines(days,pu^(1/3),lwd=1,col=rgb(0.6,0.6,1))
lines(smooth.spline(days,smth(pu^(1/3)),df=ceiling(length(pu)/3),all.knots=T),lwd=3,col=rgb(0,0,1))
lines(days,np^(1/3),lty=2,lwd=1,col=rgb(0.6,0.6,1))
lines(smooth.spline(days,smth(np^(1/3)),df=ceiling(length(np)/2),all.knots=T),lty=2,lwd=3,col=rgb(0,0,1))
mtext("Trades per Day", outer=T,side=2,line=-1.5,font=1,col=rgb(0.95,0,0))
par(las=2)
axis(1,at=days,lab=label[1:length(days)])
axis(4,at=seq(0,ceiling((max(np)^(1/3))),1),lab=(seq(0,ceiling((max(np)^(1/3))),1))^3)
par(las=0)
mtext("Users per Day", outer=T,side=4,line=-1.5,font=1,col=rgb(0,0,1))
text(26,4.5,"Trades per Active User",col=rgb(0.95,0,0))
text(26,1,"Active Users",col=rgb(0,0,1))
text(26,7,"All Users",col=rgb(0,0,1))
dev.off()
#
# Trades per question
tq <- numeric()
for (i in 1:length(qiq)) {
tq[i] <- length(qit[qit==qiq[i]])
}
png("TpQ.png", width = 7200, height = 3600, pointsize = 18, res = 360)
plot(qiq,tq,pch=16,col=rgb(0,0,1),ylab="Number of Trades",xaxt="n",xlab="Question ID")
axis(1,at=seq(10,max(qiq),10))
dev.off()

0 comments on commit 9a1c538

Please sign in to comment.