library(tidyverse)
library(caret)
library(randomForest)

# Student dropout analysis with random forests.
#
# Fits three random forest classifiers on the cleaned dropout data set:
#   1. three-class Target, default mtry
#   2. three-class Target, mtry = 2
#   3. binary Target ("Dropout" vs "Did Not Dropout"), mtry = 2
# For each model the script prints the fit, hold-out Accuracy/Kappa and a plot
# of the OOB / per-class error against the number of trees. Finally it scans
# mtry = 1..10 and reports the final OOB error for each value.

# Helpers ----

# Randomly split `data` into a training part (`train_frac` of the rows,
# rounded) and a test part (all remaining rows).
#
# The split is done on row indices, so duplicated rows are kept and a test row
# that happens to be identical to a training row is not dropped (dplyr's
# data-frame `setdiff()` would remove both).
# Returns a list with elements `train` and `test`.
split_train_test <- function(data, train_frac = 0.85) {
  n_rows <- nrow(data)
  train_idx <- sample(seq_len(n_rows), size = round(n_rows * train_frac))
  is_train <- seq_len(n_rows) %in% train_idx
  list(
    train = data[train_idx, , drop = FALSE],
    test = data[!is_train, , drop = FALSE]
  )
}

# Reshape the `err.rate` matrix of a fitted classification forest into long
# format: one row per (tree count, error type). The error types are taken
# from the matrix's column names, so nothing is hard-coded per model.
oob_error_long <- function(model) {
  err <- model$err.rate
  data.frame(
    Trees = rep(seq_len(nrow(err)), times = ncol(err)),
    Type = rep(colnames(err), each = nrow(err)),
    # as.vector() unrolls the matrix column by column, matching `Type`.
    Error = as.vector(err)
  )
}

# Line plot of error against number of trees, one line per error type.
plot_oob_error <- function(error_data, title) {
  ggplot(data = error_data, aes(x = Trees, y = Error)) +
    geom_line(aes(color = Type)) +
    ggtitle(title)
}

# Print hold-out Accuracy and Kappa for predictions `pred` against `truth`.
# caret expects the predictions as `data` and the true classes as `reference`.
# Returns the full confusionMatrix object invisibly.
print_test_metrics <- function(pred, truth) {
  cm <- confusionMatrix(data = pred, reference = truth)
  print(cm$overall[c("Accuracy", "Kappa")])
  invisible(cm)
}

# Fit one forest per value in `mtry_values` on `train` and return the OOB
# error after the last tree for each. Variable importance is not requested
# because only the error rate is read.
tune_mtry <- function(train, mtry_values = 1:10) {
  vapply(
    mtry_values,
    function(m) {
      fit <- randomForest(Target ~ ., data = train, mtry = m)
      unname(fit$err.rate[nrow(fit$err.rate), "OOB"])
    },
    numeric(1)
  )
}

# Load and prepare data ----

# Make the random splits and forests reproducible from run to run.
set.seed(630)

# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
dropout_clean <- read.csv("C:/Users/Mark/Desktop/Grad School/PDAT630/dropout_clean.csv")

# Columns that are categorical and must be factors for randomForest.
factor_cols <- c(
  "Marital.status", "Scholarship.holder", "Tuition.fees.up.to.date",
  "Gender", "Displaced", "Daytime.evening.attendance", "Course",
  "Previous.qualification", "Mother.s.qualification",
  "Father.s.qualification", "Mother.s.occupation", "Father.s.occupation",
  "International", "Target"
)
dropout_clean[factor_cols] <- lapply(dropout_clean[factor_cols], factor)
str(dropout_clean)

# Random Forest 1 ----
# Three-class Target, default mtry.

dropout_clean_rf1 <- dropout_clean
str(dropout_clean_rf1)

split1 <- split_train_test(dropout_clean_rf1, train_frac = 0.85)
df.train1 <- split1$train
df.test1 <- split1$test

dropout.rf1 <- randomForest(
  Target ~ .,
  data = df.train1,
  importance = TRUE
)
print(dropout.rf1)

pred1 <- predict(dropout.rf1, df.test1, type = "response")
print_test_metrics(pred1, df.test1$Target)

oob.error.data <- oob_error_long(dropout.rf1)
plot_oob_error(oob.error.data, "Random Forest #1 Error vs Number Of Trees")

# Random Forest 2 ----
# Three-class Target, mtry = 2.

dropout_clean_rf2 <- dropout_clean
str(dropout_clean_rf2)

split2 <- split_train_test(dropout_clean_rf2, train_frac = 0.85)
df.train2 <- split2$train
df.test2 <- split2$test

dropout.rf2 <- randomForest(
  Target ~ .,
  data = df.train2,
  importance = TRUE,
  mtry = 2
)
print(dropout.rf2)

pred2 <- predict(dropout.rf2, df.test2, type = "response")
print_test_metrics(pred2, df.test2$Target)

oob.error.data <- oob_error_long(dropout.rf2)
plot_oob_error(oob.error.data, "Random Forest #2 Error vs Number Of Trees")

# Random Forest 3 ----
# Binary Target: "Dropout" vs everything else, mtry = 2.

dropout_clean_rf3 <- mutate(
  dropout_clean,
  Target = factor(ifelse(Target == "Dropout", "Dropout", "Did Not Dropout"))
)
str(dropout_clean_rf3)

# Sanity check: class counts before and after collapsing to two classes.
table(dropout_clean$Target)
table(dropout_clean_rf3$Target)

split3 <- split_train_test(dropout_clean_rf3, train_frac = 0.85)
df.train3 <- split3$train
df.test3 <- split3$test

dropout.rf3 <- randomForest(
  Target ~ .,
  data = df.train3,
  importance = TRUE,
  mtry = 2
)
print(dropout.rf3)

pred3 <- predict(dropout.rf3, df.test3, type = "response")
print_test_metrics(pred3, df.test3$Target)

oob.error.data <- oob_error_long(dropout.rf3)
plot_oob_error(oob.error.data, "Random Forest #3 Error vs Number Of Trees")

varImpPlot(dropout.rf3, main = "Variable Importance Plot")

# mtry tuning ----
# Because mtry is scanned over 1..10, which.min() is the best mtry itself.

# Three-class problem (training data of Random Forest 1).
oob.values <- tune_mtry(df.train1, mtry_values = 1:10)
oob.values
## find the minimum error
min(oob.values)
## and the mtry that achieves it (author's earlier note recorded: 2)
which.min(oob.values)

# Binary problem (training data of Random Forest 3).
oob.values <- tune_mtry(df.train3, mtry_values = 1:10)
oob.values
## find the minimum error
min(oob.values)
## and the mtry that achieves it (author's earlier note recorded: 2)
which.min(oob.values)