####################################################################################
###### Analysis of winter 2016-17, December - March subnivium presence/absence ######
##### data using Boosted Regression Trees #######
#### FOR DATA WHICH TOOK THE MINIMUM OCCURRENCE OF PRESENCE INSTANCES FROM #######
#### ALL TREATMENT PROBES #######
#####################################################################################
# Minimum Occurrence: If one probe shows subnivium presence, then that day for that treatment is assigned a 1.
# Updated to rerun model diagnostics and confirm the correct settings for the models
# in light of two additional changes we decided to make to the predictors:
# 1st: latitude is no longer included
# 2nd: a site- and house-specific snow density is included, derived from
#      our own depth measurements and SNODAS's estimate of external density.
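# A minimal sketch of the minimum-occurrence rule described above, assuming a
# hypothetical long-format probe table `probe.data` with one row per probe per day
# and columns Date, Treat, and Presence (0/1 per probe). The master CSV loaded
# below already comes pre-aggregated this way, so this is for illustration only.
# library(dplyr)
# min.presence <- probe.data %>%
#   group_by(Date, Treat) %>%
#   summarise(Presence = as.integer(any(Presence == 1)), .groups = "drop")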
########## clean workspace and load required packages ####################
###########################################################################
#####clean workspace to improve efficiency: ###
rm(list = ls() )
gc() #releases memory
####### load relevant packages ###
library(lubridate)
library( ggplot2 ) #fancy plots
library( dplyr ) #data manipulation
library( dismo ) #Species distribution analysis package. Package used in Elith's 2017 tutorial
library( gbm) #Boosted regression tree package that is used in Elith's 2008 tutorial
########## end of package loading ###########
##############################################################
######## data loading ##########################
##############################################################
setwd("L:/LabMemberFolders/KimberlyThompson/Ch 2 Climate/Analysis/Presence Absence Min Occur Full Data")
# Note: colClasses must be passed to read.csv() itself (read.csv already returns a data frame)
presence <- read.csv(file="02-04-19 Master_min_presence.csv",
                     colClasses=c("integer", "character", "integer",
                                  rep("character", 2), rep("numeric", 7), rep("character", 2),
                                  rep("numeric", 8)))
#Fix the dates
presence$Date<-gsub("/", "-", presence$Date)
presence$Date<-parse_date_time(presence$Date, c("%Y-%m-%d", "%m-%d-%Y"), tz = "US/Central")
presence$Date<-as.POSIXct(presence$Date, tz = "US/Central", format = "%Y-%m-%d")
#### Model each treatment separately ####
pres0 <- presence[presence$Treat == "0",]
pres3 <- presence[presence$Treat == "3",]
pres5 <- presence[presence$Treat == "5",]
presext <- presence[presence$Treat == "ext",]
#Add ID column to each dataframe to aid in subsetting
pres0$ID <- seq_len(nrow(pres0))
pres3$ID <- seq_len(nrow(pres3))
pres5$ID <- seq_len(nrow(pres5))
presext$ID <- seq_len(nrow(presext))
######################################################################################
#### Model Explanation ####
######################################################################################
# Within each treatment dataset, two sets of presence/absence data are necessary,
# one for model training (building) and one for model testing (evaluation).
# Starting with approximately 70% of data for model building and 30% for model testing.
## First, identify the optimal number of trees. ##
## As per the article by Elith and Leathwick ##
## found here, https://cran.r-project.org/web/packages/dismo/vignettes/brt.pdf, look at the ##
## total number of observations you have and the total number of presences, and decide whether ##
## you think there is enough data to model interactions with reasonable complexity. They say that if yes, ##
## then a learning rate of 0.01 could be a reasonable starting point. ##
# data = training dataset, gbm.x = indices of names of predictor columns in data
# gbm.y = index of response variable
# family = use Bernoulli for presence absence (binomial)
# tree complexity = set the complexity of individual trees (models interactions --> 2 = two-way; 3 = three-way)
# learning rate = sets the weight applied to individual trees (a lower learning rate generally improves performance but requires more trees)
# bag fraction = controls the stochasticity in the model, bag.frac = 1 results in deterministic model, optimal is between 0.5-0.75
# Keep in mind these models are stochastic and so slightly different each time you run them unless you make them deterministic
# by using a bag fraction of 1.
# The goal is to jointly optimize the number of trees, the learning rate, and tree complexity
# so that predictive deviance is minimized
# IMPORTANT: Splitting the data into training and testing data is necessary in order to more broadly explore
# the optimal combination of number of trees, learning rate, and tree complexity
# Systematically alter tc, lr, and the bag fraction and compare the results.
# Since the external treatment has the most complexity associated with it, do this systematic approach on
# this treatment, and then apply that to the other treatments where things like wind are minimized.
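# Sketch (an assumption, not part of the original workflow): because the BRT fits and the
# random train/test splits below are stochastic, a seed could be set here so results are
# reproducible between sessions, and the repeated 70/30 split could be wrapped in a small
# helper. The seed value and function name are illustrative only.
# set.seed(20170204)
# split.70.30 <- function(df, n.test = 326) {
#   test.rows <- sample(seq_len(nrow(df)), size = n.test, replace = FALSE)
#   list(train = df[-test.rows, ], test = df[test.rows, ])
# }
# # e.g. s <- split.70.30(presext); presext.train <- s$train; presext.test <- s$test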
######################################################################################
#### FINDING THE IDEAL SETTINGS FOR THE MODEL USING THE EXTERNAL PROBE DATASET ####
######################################################################################
##### Split data into training and testing datasets with approximately 70/30 split
# Total number of sites sampled in presence/absence data:
M <- max( presext$ID )
# Define number of data points to use for testing:
TP <- 326 # ~30% of data points
# Select which rows to use for testing for our presence/absence data:
t.parows <- sample( x = 1:M, size = TP, replace = FALSE )
presext.train <- presext[ -t.parows, ]
# View
head( presext.train ); dim( presext.train )
# Create testing dataset:
presext.test <- presext[ t.parows, ]
#### Systematic testing of different numbers of trees and learning rates
# Create a list of learning rates to test
lr <- c(0.05, 0.01, 0.005, 0.001, 0.0005)
tree.list <- seq(100, 20000, by=100)
################ TC = 1 ###########################
#Create an empty dataframe in which to store the predicted deviances
deviance.tc1<-data.frame(lr05=numeric(200), lr01=numeric(200), lr005=numeric(200), lr001=numeric(200),
lr0005=numeric(200), nt=seq(100, 20000, by=100))
# Model each different learning rate and store them in a data frame
for (i in 1:length(lr)) {
mod <- gbm.fixed(data=presext.train, gbm.x = c(6, 7, 10:13), gbm.y = 1,
family = "bernoulli", tree.complexity = 1, learning.rate = lr[i],
n.trees = 20000)
# Create matrix of predictions, each column = predictions from the model - for example, the predictions
# in column 5 are for tree.list[5]=500 trees
pred <- gbm::predict.gbm(mod, presext.test, n.trees = tree.list, type="response")
# Calculate deviance of all of these results and store them in a dataframe:
for (j in 1:length(deviance.tc1[,i])) {
deviance.tc1[,i][j] <- calc.deviance(presext.test$Presence, pred[,j], calc.mean = T)
}
}
min(deviance.tc1, na.rm=TRUE)
which(deviance.tc1$lr05 == min(deviance.tc1, na.rm=TRUE))
deviance.tc1[6,]
#Minimum is 0.790 at 600 trees at lr of 0.05
# Graph the results of the learning rate comparison for tc=1
tc1 <- ggplot() +
geom_line(data=deviance.tc1, aes(x=nt, y=lr05, linetype="0.05"), size=1) +
geom_line(data=deviance.tc1, aes(x=nt, y=lr01, linetype="0.01"), size=1) +
geom_line(data=deviance.tc1, aes(x=nt, y=lr005, linetype="0.005"), size=1) +
geom_line(data=deviance.tc1, aes(x=nt, y=lr001, linetype="0.001"), size=1) +
geom_line(data=deviance.tc1, aes(x=nt, y=lr0005, linetype="0.0005"), size=1) +
geom_hline(yintercept=min(deviance.tc1[,c(1:5)], na.rm=TRUE), linetype="solid", color="red") +
geom_vline(xintercept=600, linetype="solid", color="green") +
#geom_errorbar(width=0.2, size=2) + geom_point(size=8) +
theme_bw() +
theme(axis.text.x = element_text(size=20, face="bold")) +
theme(axis.text.y = element_text(size=20, face="bold")) +
theme(axis.title.x = element_text(size=22, face="bold", color="gray30")) +
theme(axis.title.y = element_text(size=22, face="bold", color="gray30")) +
scale_x_continuous(name="\nNumber of Trees", limits=c(0, 20000),
breaks=c(0, 5000, 10000, 15000, 20000)) +
scale_y_continuous(name="Predictive Deviance\n") +
scale_linetype_manual(name='', values = c('0.05' = "twodash",
'0.01' = "solid",
'0.005' = "longdash",
'0.001' = "dotted",
'0.0005' = "dotdash"),
labels = c("0.0005", "0.001", "0.005", "0.01", "0.05")) +
theme(aspect.ratio=1) + #aspect ratio expressed as y/x
# theme(legend.position = c(0.85, 0.60)) +
theme(legend.position = "none") +
theme(legend.text = element_text(size=14)) +
guides(colour = guide_legend(override.aes = (list(size=3)))) +
annotate("text", x = 300, y = 1.18, label = "(a)",
size=7) +
annotate("text", x = 3500, y = 1.18, label = "tc = 1",
fontface="italic", size=7) +
annotate("text", x = 14500, y = 1.18, label = "min=0.79, lr=0.05",
fontface="italic", size=7) +
annotate("text", x = 17500, y = 1.14, label = "nt=600",
fontface="italic", size=7)
setwd("L:/LabMemberFolders/KimberlyThompson/Ch 2 Climate/Analysis/Graphs/BRT/02-04-19 Minimum Occurrence")
ggsave("Ext - Deviance tc1 nt20000.jpg", plot=tc1, device = "jpeg",
width=7, height=5, dpi=300)
################ TC = 2 ###########################
#Create an empty dataframe in which to store the predicted deviances
deviance.tc2<-data.frame(lr05=numeric(200), lr01=numeric(200), lr005=numeric(200), lr001=numeric(200),
lr0005=numeric(200), nt=seq(100, 20000, by=100))
# Model each different learning rate and store them in a data frame
for (i in 1:length(lr)) {
mod <- gbm.fixed(data=presext.train, gbm.x = c(6, 7, 10:13), gbm.y = 1,
family = "bernoulli", tree.complexity = 2, learning.rate = lr[i],
n.trees = 20000)
# Create matrix of predictions, each column = predictions from the model - for example, the predictions
# in column 5 are for tree.list[5]=500 trees
pred <- gbm::predict.gbm(mod, presext.test, n.trees = tree.list, type="response")
# Calculate deviance of all of these results and store them in a dataframe:
for (j in 1:length(deviance.tc2[,i])) {
deviance.tc2[,i][j] <- calc.deviance(presext.test$Presence, pred[,j], calc.mean = T)
}
}
min(deviance.tc2, na.rm=TRUE)
which(deviance.tc2$lr01 == min(deviance.tc2, na.rm=TRUE))
deviance.tc2[31,]
#Minimum is 0.70 at 3100 trees at lr of 0.01
# Graph the results of the learning rate comparison for tc=2
tc2 <- ggplot() +
geom_line(data=deviance.tc2, aes(x=nt, y=lr05, linetype="0.05"), size=1) +
geom_line(data=deviance.tc2, aes(x=nt, y=lr01, linetype="0.01"), size=1) +
geom_line(data=deviance.tc2, aes(x=nt, y=lr005, linetype="0.005"), size=1) +
geom_line(data=deviance.tc2, aes(x=nt, y=lr001, linetype="0.001"), size=1) +
geom_line(data=deviance.tc2, aes(x=nt, y=lr0005, linetype="0.0005"), size=1) +
geom_hline(yintercept=min(deviance.tc2[,c(1:5)], na.rm=TRUE), linetype="solid", color="red") +
geom_vline(xintercept=3100, linetype="solid", color="green") +
#geom_errorbar(width=0.2, size=2) + geom_point(size=8) +
theme_bw() +
theme(axis.text.x = element_text(size=20, face="bold")) +
theme(axis.text.y = element_text(size=20, face="bold")) +
theme(axis.title.x = element_text(size=22, face="bold", color="gray30")) +
theme(axis.title.y = element_text(size=22, face="bold", color="gray30")) +
scale_x_continuous(name="\nNumber of Trees", limits=c(0, 20000),
breaks=c(0, 5000, 10000, 15000, 20000)) +
scale_y_continuous(name="Predictive Deviance\n", breaks=c(0.9, 1.2, 1.5)) +
scale_linetype_manual(name='', values = c('0.05' = "twodash",
'0.01' = "solid",
'0.005' = "longdash",
'0.001' = "dotted",
'0.0005' = "dotdash"),
labels = c("0.0005", "0.001", "0.005", "0.01", "0.05")) +
theme(aspect.ratio=1) + #aspect ratio expressed as y/x
theme(legend.position = c(0.85, 0.50)) +
# theme(legend.position = "none") +
theme(legend.text = element_text(size=14)) +
guides(colour = guide_legend(override.aes = (list(size=3)))) +
annotate("text", x = 300, y = 1.75, label = "(b)",
size=7) +
annotate("text", x = 3500, y = 1.75, label = "tc = 2",
fontface="italic", size=7) +
annotate("text", x = 14500, y = 1.75, label = "min=0.70, lr=0.01",
fontface="italic", size=7) +
annotate("text", x = 17300, y = 1.63, label = "nt=3100",
fontface="italic", size=7)
setwd("L:/LabMemberFolders/KimberlyThompson/Ch 2 Climate/Analysis/Graphs/BRT/02-04-19 Minimum Occurrence")
ggsave("Ext - Deviance tc2 nt20000.jpg", plot=tc2, device = "jpeg",
width=7, height=5, dpi=300)
################ TC = 3 ###########################
#Create an empty dataframe in which to store the predicted deviances
deviance.tc3<-data.frame(lr05=numeric(200), lr01=numeric(200), lr005=numeric(200), lr001=numeric(200),
lr0005=numeric(200), nt=seq(100, 20000, by=100))
# Model each different learning rate and store them in a data frame
for (i in 1:length(lr)) {
mod <- gbm.fixed(data=presext.train, gbm.x = c(6, 7, 10:13), gbm.y = 1,
family = "bernoulli", tree.complexity = 3, learning.rate = lr[i],
n.trees = 20000)
# Create matrix of predictions, each column = predictions from the model - for example, the predictions
# in column 5 are for tree.list[5]=500 trees
pred <- gbm::predict.gbm(mod, presext.test, n.trees = tree.list, type="response")
# Calculate deviance of all of these results and store them in a dataframe:
for (j in 1:length(deviance.tc3[,i])) {
deviance.tc3[,i][j] <- calc.deviance(presext.test$Presence, pred[,j], calc.mean = T)
}
}
min(deviance.tc3, na.rm=TRUE)
which(deviance.tc3$lr01 == min(deviance.tc3, na.rm=TRUE))
deviance.tc3[23,]
#Minimum is 0.65 at 2300 trees at lr of 0.01
# Graph the results of the learning rate comparison for tc=3
tc3 <- ggplot() +
geom_line(data=deviance.tc3, aes(x=nt, y=lr05, linetype="0.05"), size=1) +
geom_line(data=deviance.tc3, aes(x=nt, y=lr01, linetype="0.01"), size=1) +
geom_line(data=deviance.tc3, aes(x=nt, y=lr005, linetype="0.005"), size=1) +
geom_line(data=deviance.tc3, aes(x=nt, y=lr001, linetype="0.001"), size=1) +
geom_line(data=deviance.tc3, aes(x=nt, y=lr0005, linetype="0.0005"), size=1) +
geom_hline(yintercept=min(deviance.tc3[,c(1:5)], na.rm=TRUE), linetype="solid", color="red") +
geom_vline(xintercept=2300, linetype="solid", color="green") +
#geom_errorbar(width=0.2, size=2) + geom_point(size=8) +
theme_bw() +
theme(axis.text.x = element_text(size=20, face="bold")) +
theme(axis.text.y = element_text(size=20, face="bold")) +
theme(axis.title.x = element_text(size=22, face="bold", color="gray30")) +
theme(axis.title.y = element_text(size=22, face="bold", color="gray30")) +
scale_x_continuous(name="\nNumber of Trees", limits=c(0, 20000),
breaks=c(0, 5000, 10000, 15000, 20000)) +
scale_y_continuous(name="Predictive Deviance\n") +
scale_linetype_manual(name='', values = c('0.05' = "twodash",
'0.01' = "solid",
'0.005' = "longdash",
'0.001' = "dotted",
'0.0005' = "dotdash"),
labels = c("0.0005", "0.001", "0.005", "0.01", "0.05")) +
theme(aspect.ratio=1) + #aspect ratio expressed as y/x
# theme(legend.position = c(0.85, 0.60)) +
theme(legend.position = "none") +
theme(legend.text = element_text(size=14)) +
guides(colour = guide_legend(override.aes = (list(size=3)))) +
annotate("text", x = 300, y = 2, label = "(c)",
size=7) +
annotate("text", x = 3500, y = 2, label = "tc = 3",
fontface="italic", size=7) +
annotate("text", x = 14500, y = 2, label = "min=0.65, lr=0.01",
fontface="italic", size=7) +
annotate("text", x = 17300, y = 1.85, label = "nt=2300",
fontface="italic", size=7)
setwd("L:/LabMemberFolders/KimberlyThompson/Ch 2 Climate/Analysis/Graphs/BRT/02-04-19 Minimum Occurrence")
ggsave("Ext - Deviance tc3 nt20000.jpg", plot=tc3, device = "jpeg",
width=7, height=5, dpi=300)
################ TC = 4 ###########################
#Create an empty dataframe in which to store the predicted deviances
deviance.tc4<-data.frame(lr05=numeric(200), lr01=numeric(200), lr005=numeric(200), lr001=numeric(200),
lr0005=numeric(200), nt=seq(100, 20000, by=100))
# Model each different learning rate and store them in a data frame
for (i in 1:length(lr)) {
mod <- gbm.fixed(data=presext.train, gbm.x = c(6, 7, 10:13), gbm.y = 1,
family = "bernoulli", tree.complexity = 4, learning.rate = lr[i],
n.trees = 20000)
# Create matrix of predictions, each column = predictions from the model - for example, the predictions
# in column 5 are for tree.list[5]=500 trees
pred <- gbm::predict.gbm(mod, presext.test, n.trees = tree.list, type="response")
# Calculate deviance of all of these results and store them in a dataframe:
for (j in 1:length(deviance.tc4[,i])) {
deviance.tc4[,i][j] <- calc.deviance(presext.test$Presence, pred[,j], calc.mean = T)
}
}
min(deviance.tc4, na.rm=TRUE)
which(deviance.tc4$lr01 == min(deviance.tc4, na.rm=TRUE))
deviance.tc4[19,]
#Minimum is 0.62 at 1900 trees at lr of 0.01
# Graph the results of the learning rate comparison for tc=4
tc4 <- ggplot() +
geom_line(data=deviance.tc4, aes(x=nt, y=lr05, linetype="0.05"), size=1) +
geom_line(data=deviance.tc4, aes(x=nt, y=lr01, linetype="0.01"), size=1) +
geom_line(data=deviance.tc4, aes(x=nt, y=lr005, linetype="0.005"), size=1) +
geom_line(data=deviance.tc4, aes(x=nt, y=lr001, linetype="0.001"), size=1) +
geom_line(data=deviance.tc4, aes(x=nt, y=lr0005, linetype="0.0005"), size=1) +
geom_hline(yintercept=min(deviance.tc4[,c(1:5)], na.rm=TRUE), linetype="solid", color="red") +
geom_vline(xintercept=1900, linetype="solid", color="green") +
#geom_errorbar(width=0.2, size=2) + geom_point(size=8) +
theme_bw() +
theme(axis.text.x = element_text(size=20, face="bold")) +
theme(axis.text.y = element_text(size=20, face="bold")) +
theme(axis.title.x = element_text(size=22, face="bold", color="gray30")) +
theme(axis.title.y = element_text(size=22, face="bold", color="gray30")) +
scale_x_continuous(name="\nNumber of Trees", limits=c(0, 20000),
breaks=c(0, 5000, 10000, 15000, 20000)) +
scale_y_continuous(name="Predictive Deviance\n") +
scale_linetype_manual(name='', values = c('0.05' = "twodash",
'0.01' = "solid",
'0.005' = "longdash",
'0.001' = "dotted",
'0.0005' = "dotdash"),
labels = c("0.0005", "0.001", "0.005", "0.01", "0.05")) +
theme(aspect.ratio=1) + #aspect ratio expressed as y/x
# theme(legend.position = c(0.85, 0.60)) +
theme(legend.position = "none") +
theme(legend.text = element_text(size=14)) +
guides(colour = guide_legend(override.aes = (list(size=3)))) +
annotate("text", x = 300, y = 1.95, label = "(d)",
size=7) +
annotate("text", x = 3500, y = 1.95, label = "tc = 4",
fontface="italic", size=7) +
annotate("text", x = 14500, y = 1.95, label = "min=0.62, lr=0.01",
fontface="italic", size=7) +
annotate("text", x = 17300, y = 1.80, label = "nt=1900",
fontface="italic", size=7)
setwd("L:/LabMemberFolders/KimberlyThompson/Ch 2 Climate/Analysis/Graphs/BRT/02-04-19 Minimum Occurrence")
ggsave("Ext - Deviance tc4 nt20000.jpg", plot=tc4, device = "jpeg",
width=7, height=5, dpi=300)
######## Based on the above systematic alteration of the learning rates and tree ######
######## complexity, the optimal combination of settings for the EXTERNAL PROBES ######
######## is tc=3, lr=0.005, and nt=5200. TC=4 gives a slightly lower deviance and ######
######## still uses at least 1000 trees, but a tree complexity of 4 (essentially ######
######## a 4-way interaction) seems excessive. ######
######################### BAG FRACTION = STOCHASTICITY ###############################
# Now vary the bag fraction
bf<-c(0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60, 0.65, 0.70, 0.75)
#for the first potential tc and lr combination (tc=3 and lr=0.005)
#Create an empty dataframe in which to store the predicted deviances
deviance.bf<-data.frame(bf25=numeric(200), bf30=numeric(200), bf35=numeric(200), bf40=numeric(200),
bf45=numeric(200), bf50=numeric(200), bf55=numeric(200), bf60=numeric(200),
bf65=numeric(200), bf70=numeric(200), bf75=numeric(200), nt=seq(100, 20000, by=100))
# Model each different bag fraction and store the results in a data frame
for (i in 1:length(bf)) {
mod <- gbm.fixed(data=presext.train, gbm.x = c(6, 7, 10:13), gbm.y = 1,
family = "bernoulli", tree.complexity = 3, learning.rate = 0.005,
n.trees = 20000, bag.fraction = bf[i])
# Create matrix of predictions, each column = predictions from the model
pred <- gbm::predict.gbm(mod, presext.test, n.trees = tree.list, type="response")
# Calculate deviance of all of these results and store them in a dataframe:
for (j in 1:length(deviance.bf[,i])) {
deviance.bf[,i][j] <- calc.deviance(presext.test$Presence, pred[,j], calc.mean = T)
}
}
min(deviance.bf, na.rm=TRUE)
which(deviance.bf[,5] == min(deviance.bf, na.rm=TRUE))
deviance.bf[54,]
#Minimum is 0.654 at 4600 trees at bf of 0.45 - 02/05/19
# Rerunning plots for manuscript - minimum is 0.65 at 5400 trees at bf of 0.45
# Graph the results of the bag fraction comparison
# Name the plot object separately so the bf vector above is not overwritten
bf.plot <- ggplot() +
geom_line(data=deviance.bf, aes(x=nt, y=bf25, color="bf25"), size=1) +
geom_line(data=deviance.bf, aes(x=nt, y=bf30, color="bf30"), size=1) +
geom_line(data=deviance.bf, aes(x=nt, y=bf35, color="bf35"), size=1) +
geom_line(data=deviance.bf, aes(x=nt, y=bf40, color="bf40"), size=1) +
geom_line(data=deviance.bf, aes(x=nt, y=bf45, color="bf45"), size=1) +
geom_line(data=deviance.bf, aes(x=nt, y=bf50, color="bf50"), size=1) +
geom_line(data=deviance.bf, aes(x=nt, y=bf55, color="bf55"), size=1) +
geom_line(data=deviance.bf, aes(x=nt, y=bf60, color="bf60"), size=1) +
geom_line(data=deviance.bf, aes(x=nt, y=bf65, color="bf65"), size=1) +
geom_line(data=deviance.bf, aes(x=nt, y=bf70, color="bf70"), size=1) +
geom_line(data=deviance.bf, aes(x=nt, y=bf75, color="bf75"), size=1) +
geom_hline(yintercept=min(deviance.bf[,c(1:11)], na.rm=TRUE), linetype="solid", color="red") +
geom_vline(xintercept=5400, linetype="solid", color="green") +
#geom_errorbar(width=0.2, size=2) + geom_point(size=8) +
theme_bw() +
theme(axis.text.x = element_text(size=20, face="bold")) +
theme(axis.text.y = element_text(size=20, face="bold")) +
theme(axis.title.x = element_text(size=22, face="bold", color="gray30")) +
theme(axis.title.y = element_text(size=22, face="bold", color="gray30")) +
scale_x_continuous(name="\nNumber of Trees", limits=c(0, 20000),
breaks=c(0, 5000, 10000, 15000, 20000)) +
scale_y_continuous(name="Predictive Deviance\n") +
scale_colour_manual(name='', values = c('bf25' = "red",
'bf30' = "orange",
'bf35' = "yellow",
'bf40' = "green",
'bf45' = "blue",
'bf50' = "purple",
'bf55' = "darkgreen",
'bf60' = "turquoise2",
'bf65' = "brown3",
'bf70' = "darkgoldenrod3",
'bf75' = "hotpink"),
labels = c("0.25", "0.30", "0.35", "0.40", "0.45", "0.50",
"0.55", "0.60", "0.65", "0.70", "0.75")) +
theme(aspect.ratio=1) + #aspect ratio expressed as y/x
# theme(legend.position = c(0.85, 0.60)) +
# theme(legend.position = "none") +
theme(legend.text = element_text(size=14)) +
guides(colour = guide_legend(override.aes = list(size=3)))
# annotate("text", x = 2200, y = 1, label = "bf = 0.45",
# fontface="italic", size=7) +
# annotate("text", x = 12500, y = 1, label = "min=0.65, lr=0.01",
# fontface="italic", size=7) +
# annotate("text", x = 12500, y = 0.9, label = "nt=5400, tc=3",
# fontface="italic", size=7)
setwd("L:/LabMemberFolders/KimberlyThompson/Ch 2 Climate/Analysis/Graphs/BRT/02-04-19 Minimum Occurrence")
ggsave("Ext - Deviance BF nt20000.jpg", plot=bf, device = "jpeg",
width=7, height=5, dpi=300)
# Based on the above, the ideal settings are as follows (deviance is close to its minimum with a tree complexity of 3):
# learning rate = 0.005
# tree complexity = 3
# bag fraction = 0.45
######################################################################################
#### External Model ####
######################################################################################
#Rerun test and training separation so that new random sets (different from those used
#above in the optimal settings identification) can be generated
##### Split data into training and testing datasets with approximately 70/30 split
# Total number of observations in the presence/absence data:
M <- max( presext$ID )
# Define number of data points to use for testing:
TP <- 326 # ~30% of data points
# Select which rows to use for testing for our presence/absence data:
t.parows <- sample( x = 1:M, size = TP, replace = FALSE )
presext.train <- presext[ -t.parows, ]
# View
head( presext.train ); dim( presext.train )
# Create testing dataset:
presext.test <- presext[ t.parows, ]
########## Model
#BRT model with optimal settings identified from first part of code
modext <- gbm.step(data=presext.train, gbm.x = c(6, 7, 10, 11, 13, 22), gbm.y = 1,
family = "bernoulli", tree.complexity = 3, learning.rate = 0.005,
max.trees = 7500, bag.fraction = 0.45)
# Be careful of partial dependence plots (plotting the effect of each single variable)
# if predictors are correlated or if there are interactions
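# Sketch of one way to check for that (illustrative only, not part of the original
# workflow): inspect pairwise correlations among the predictors passed to gbm.x
# before interpreting single-variable partial dependence plots. Assumes those
# columns are all numeric.
# round(cor(presext.train[, c(6, 7, 10, 11, 13, 22)],
#           use = "pairwise.complete.obs"), 2)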
###### Identifying interactions ######
find.int <- dismo::gbm.interactions(modext)
find.int$interactions
find.int$rank.list
gbm.perspec(modext, 1, 6)
gbm.perspec(modext, 5, 3)
#Variable importance
modext$contributions
#partial dependence plots
gbm.plot(modext)
### Predicting to testing data - MODEL evaluation ###
preds <- gbm::predict.gbm(modext, presext.test, n.trees=modext$gbm.call$best.trees, type="response")
calc.deviance(obs=presext.test$Presence, pred=preds, calc.mean=TRUE)
d <- cbind(presext.test$Presence, preds)
pres <- d[d[,1]==1, 2]
abs <- d[d[,1]==0, 2]
e <- evaluate(p=pres, a=abs)
e
plot(e, 'ROC')
# Save this graph in case it needs to be presented in supplemental materials
setwd("L:/LabMemberFolders/KimberlyThompson/Ch 2 Climate/Analysis/Graphs/BRT/AUC Plots/Minimum Occurrence")
jpeg('External Model AUC Curve.jpg', width=7, height=5, units= 'in', res=300)
plot(e, 'ROC')
dev.off()
modext$cv.statistics
# To make prettier graphs of any interactions and/or partial dependence plots, make a new dataframe that has the
# training data and the fitted values; these can then be used with ggplot.
# Add columns for fitted, fitted.vars, and residuals
ext <- cbind(presext.train, modext$fitted, modext$fitted.vars, modext$residuals, modext$fit)
setwd("L:/LabMemberFolders/KimberlyThompson/Ch 2 Climate/Analysis/Modeling Results/Fitted values for graphing/Minimum Occurrence")
write.csv(ext, "External Model Fitted Values.csv", row.names = FALSE)
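# Sketch of one such ggplot (illustrative only; "Depth" stands in for whichever
# predictor column is of interest). modext$fitted holds the fitted probabilities
# for the training data on the response scale.
# ggplot(data = presext.train, aes(x = Depth, y = modext$fitted)) +
#   geom_point(alpha = 0.3) +
#   geom_smooth(se = FALSE) +
#   labs(x = "Snow depth", y = "Fitted probability of subnivium presence") +
#   theme_bw()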
######################################################################################
#### House 0 Model ####
######################################################################################
##### Split data into training and testing datasets with approximately 70/30 split
# Total number of observations in the presence/absence data:
M <- max( pres0$ID )
# Define number of data points to use for testing:
TP <- 326 # ~30% of data points
# Select which rows to use for testing for our presence/absence data:
t.parows <- sample( x = 1:M, size = TP, replace = FALSE )
pres0.train <- pres0[ -t.parows, ]
# View
head( pres0.train ); dim( pres0.train )
# Create testing dataset:
pres0.test <- pres0[ t.parows, ]
########## Model
#BRT model with optimal settings identified from first part of code
mod0 <- gbm.step(data=pres0.train, gbm.x = c(6, 7, 10, 11, 13, 22), gbm.y = 1,
family = "bernoulli", tree.complexity = 3, learning.rate = 0.005,
max.trees = 7500, bag.fraction = 0.45)
# Be careful of partial dependence plots (plotting the effect of each single variable)
# if predictors are correlated or if there are interactions
###### Identifying interactions ######
find.int <- dismo::gbm.interactions(mod0)
find.int$interactions
find.int$rank.list
gbm.perspec(mod0, 5, 3)
#Variable importance
mod0$contributions
#partial dependence plots
gbm.plot(mod0)
### Predicting to testing data - MODEL evaluation ###
preds <- gbm::predict.gbm(mod0, pres0.test, n.trees=mod0$gbm.call$best.trees, type="response")
calc.deviance(obs=pres0.test$Presence, pred=preds, calc.mean=TRUE)
d <- cbind(pres0.test$Presence, preds)
pres <- d[d[,1]==1, 2]
abs <- d[d[,1]==0, 2]
e <- evaluate(p=pres, a=abs)
e
plot(e, 'ROC')
# Save this graph in case it needs to be presented in supplemental materials
setwd("L:/LabMemberFolders/KimberlyThompson/Ch 2 Climate/Analysis/Graphs/BRT/AUC Plots/Minimum Occurrence")
jpeg('House 0 Model AUC Curve.jpg', width=7, height=5, units= 'in', res=300)
plot(e, 'ROC')
dev.off()
mod0$cv.statistics
# To make prettier graphs of any interactions and/or partial dependence plots, make a new dataframe that has the
# training data and the fitted values; these can then be used with ggplot.
# Add columns for fitted, fitted.vars, and residuals
H0 <- cbind(pres0.train, mod0$fitted, mod0$fitted.vars, mod0$residuals, mod0$fit)
setwd("L:/LabMemberFolders/KimberlyThompson/Ch 2 Climate/Analysis/Modeling Results/Fitted values for graphing/Minimum Occurrence")
write.csv(H0, "House 0 Model Fitted Values.csv", row.names = FALSE)
######################################################################################
#### House 3 Model ####
######################################################################################
##### Split data into training and testing datasets with approximately 70/30 split
# Total number of observations in the presence/absence data:
M <- max( pres3$ID )
# Define number of data points to use for testing:
TP <- 326 # ~30% of data points
# Select which rows to use for testing for our presence/absence data:
t.parows <- sample( x = 1:M, size = TP, replace = FALSE )
pres3.train <- pres3[ -t.parows, ]
# View
head( pres3.train ); dim( pres3.train )
# Create testing dataset:
pres3.test <- pres3[ t.parows, ]
########## Model
#BRT model with optimal settings identified from first part of code
mod3 <- gbm.step(data=pres3.train, gbm.x = c(6, 7, 10, 11, 13, 22), gbm.y = 1,
family = "bernoulli", tree.complexity = 3, learning.rate = 0.005,
max.trees = 7500, bag.fraction = 0.45)
# Be careful of partial dependence plots (plotting the effect of each single variable)
# if predictors are correlated or if there are interactions
###### Identifying interactions ######
find.int <- dismo::gbm.interactions(mod3)
find.int$interactions
find.int$rank.list
gbm.perspec(mod3, 5, 3)
gbm.perspec(mod3, 6, 3)
#Variable importance
mod3$contributions
#partial dependence plots
gbm.plot(mod3)
### Predicting to testing data - MODEL evaluation ###
preds <- gbm::predict.gbm(mod3, pres3.test, n.trees=mod3$gbm.call$best.trees, type="response")
calc.deviance(obs=pres3.test$Presence, pred=preds, calc.mean=TRUE)
d <- cbind(pres3.test$Presence, preds)
pres <- d[d[,1]==1, 2]
abs <- d[d[,1]==0, 2]
e <- evaluate(p=pres, a=abs)
e
plot(e, 'ROC')
# Save this graph in case it needs to be presented in supplemental materials
setwd("L:/LabMemberFolders/KimberlyThompson/Ch 2 Climate/Analysis/Graphs/BRT/AUC Plots/Minimum Occurrence")
jpeg('House 3 Model AUC Curve.jpg', width=7, height=5, units= 'in', res=300)
plot(e, 'ROC')
dev.off()
mod3$cv.statistics
# To make prettier graphs of any interactions and/or partial dependence plots, make a new dataframe that has the
# training data and the fitted values; these can then be used with ggplot.
# Add columns for fitted, fitted.vars, and residuals
H3 <- cbind(pres3.train, mod3$fitted, mod3$fitted.vars, mod3$residuals, mod3$fit)
setwd("L:/LabMemberFolders/KimberlyThompson/Ch 2 Climate/Analysis/Modeling Results/Fitted values for graphing/Minimum Occurrence")
write.csv(H3, "House 3 Model Fitted Values.csv", row.names = FALSE)
######################################################################################
#### House 5 Model ####
######################################################################################
##### Split data into training and testing datasets with approximately 70/30 split
# Total number of observations in the presence/absence data:
M <- max( pres5$ID )
# Define number of data points to use for testing:
TP <- 326 # ~30% of data points
# Select which rows to use for testing for our presence/absence data:
t.parows <- sample( x = 1:M, size = TP, replace = FALSE )
pres5.train <- pres5[ -t.parows, ]
# View
head( pres5.train ); dim( pres5.train )
# Create testing dataset:
pres5.test <- pres5[ t.parows, ]
########## Model
#BRT model with optimal settings identified from first part of code
mod5 <- gbm.step(data=pres5.train, gbm.x = c(6, 7, 10, 11, 13, 22), gbm.y = 1,
family = "bernoulli", tree.complexity = 3, learning.rate = 0.005,
max.trees = 7500, bag.fraction = 0.45)
# Be careful of partial dependence plots (plotting the effect of each single variable)
# if predictors are correlated or if there are interactions
###### Identifying interactions ######
find.int <- dismo::gbm.interactions(mod5)
find.int$interactions
find.int$rank.list
gbm.perspec(mod5, 3, 6)
gbm.perspec(mod5, 6, 3)
#Variable importance
mod5$contributions
#partial dependence plots
gbm.plot(mod5)
### Predicting to testing data - MODEL evaluation ###
preds <- gbm::predict.gbm(mod5, pres5.test, n.trees=mod5$gbm.call$best.trees, type="response")
calc.deviance(obs=pres5.test$Presence, pred=preds, calc.mean=TRUE)
d <- cbind(pres5.test$Presence, preds)
pres <- d[d[,1]==1, 2]
abs <- d[d[,1]==0, 2]
e <- evaluate(p=pres, a=abs)
e
plot(e, 'ROC')
# Save this graph in case it needs to be presented in supplemental materials
setwd("L:/LabMemberFolders/KimberlyThompson/Ch 2 Climate/Analysis/Graphs/BRT/AUC Plots/Minimum Occurrence")
jpeg('House 5 Model AUC Curve.jpg', width=7, height=5, units= 'in', res=300)
plot(e, 'ROC')
dev.off()
mod5$cv.statistics
# To make prettier graphs of any interactions and/or partial dependence plots, make a new dataframe that has the
# training data and the fitted values; these can then be used with ggplot.
# Add columns for fitted, fitted.vars, and residuals
H5 <- cbind(pres5.train, mod5$fitted, mod5$fitted.vars, mod5$residuals, mod5$fit)
setwd("L:/LabMemberFolders/KimberlyThompson/Ch 2 Climate/Analysis/Modeling Results/Fitted values for graphing/Minimum Occurrence")
write.csv(H5, "House 5 Model Fitted Values.csv", row.names = FALSE)
########################################################################################
##### SAVE MODEL OBJECTS SO THAT THEY CAN BE RELOADED FOR PREDICTIONS LATER ############
########################################################################################
setwd("L:/LabMemberFolders/KimberlyThompson/Ch 2 Climate/Analysis/Modeling Results/Model Objects/Minimum Occurrence")
saveRDS(modext, "External_BRT_Model.rds")
saveRDS(mod0, "House0_BRT_Model.rds")
saveRDS(mod3, "House3_BRT_Model.rds")
saveRDS(mod5, "House5_BRT_Model.rds")
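# Sketch of reloading a saved model object and predicting later (illustrative only;
# file name as saved above, and the new-data object is whatever data frame the
# predictions are needed for):
# modext <- readRDS("External_BRT_Model.rds")
# new.preds <- gbm::predict.gbm(modext, newdata = presext.test,
#                               n.trees = modext$gbm.call$best.trees, type = "response")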