# Dropout RF.R
#
# Random-forest models for the cleaned student dropout data.
#
# Three forests are fitted, each on its own 85/15 train/test split:
#   1. Target as read from the file, default mtry
#   2. Target as read from the file, mtry = 2
#   3. Target collapsed to "Dropout" vs "Did Not Dropout", mtry = 2
# For every model the script prints the fit, the held-out accuracy and
# kappa, and a plot of error rate against number of trees. It finishes by
# comparing the final OOB error for mtry = 1..10 on the model 1 and model 3
# training sets.

library(tidyverse)
library(caret)
library(randomForest)

# Seed the RNG once so the splits, the forests and the mtry comparison are
# reproducible from run to run.
set.seed(2024)

# Helpers ----

#' Split a data frame into train and test partitions by row index.
#'
#' Rows are assigned by index rather than by a set difference of whole rows,
#' so duplicated rows in `data` cannot shrink the test partition.
#'
#' @param data A data frame.
#' @param train_frac Fraction of rows placed in the training partition.
#' @return A list with elements `train` and `test`.
split_train_test <- function(data, train_frac = 0.85) {
  n_rows <- nrow(data)
  train_idx <- sample.int(n_rows, size = round(train_frac * n_rows))
  test_idx <- base::setdiff(seq_len(n_rows), train_idx)
  list(
    train = data[train_idx, , drop = FALSE],
    test = data[test_idx, , drop = FALSE]
  )
}

#' Accuracy and kappa of predictions against the true classes.
#'
#' @param predicted Factor of predicted classes.
#' @param actual Factor of true classes with the same levels.
#' @return Named numeric vector with elements "Accuracy" and "Kappa".
holdout_metrics <- function(predicted, actual) {
  # caret expects the predictions as `data` and the truth as `reference`.
  cm <- confusionMatrix(data = predicted, reference = actual)
  cm$overall[c("Accuracy", "Kappa")]
}

#' Reshape a forest's `err.rate` matrix to long format for plotting.
#'
#' One row is produced per (tree count, error type) pair; the error types
#' are taken from the column names of `err.rate`, so any number of classes
#' is supported.
#'
#' @param rf_model A fitted classification `randomForest` object.
#' @return A data frame with columns `Trees`, `Type` and `Error`.
oob_error_long <- function(rf_model) {
  err <- rf_model$err.rate
  data.frame(
    Trees = rep(seq_len(nrow(err)), times = ncol(err)),
    Type = rep(colnames(err), each = nrow(err)),
    # as.vector() unrolls the matrix column by column, matching `Type`.
    Error = as.vector(err)
  )
}

#' Line plot of error rate against number of trees, one line per type.
#'
#' @param error_data Data frame as returned by `oob_error_long()`.
#' @param title Plot title.
#' @return A ggplot object.
plot_oob_error <- function(error_data, title) {
  ggplot(data = error_data, aes(x = Trees, y = Error)) +
    geom_line(aes(color = Type)) +
    ggtitle(title)
}

#' Final OOB error of a forest fitted at each candidate `mtry`.
#'
#' Variable importance is not requested because only the OOB error is used.
#'
#' @param train_data Training data frame containing a `Target` factor.
#' @param mtry_values Candidate values for `mtry`.
#' @return Numeric vector of OOB error rates, named by `mtry` value.
oob_error_by_mtry <- function(train_data, mtry_values = 1:10) {
  errors <- vapply(
    mtry_values,
    function(m) {
      fit <- randomForest(Target ~ ., data = train_data, mtry = m)
      fit$err.rate[nrow(fit$err.rate), "OOB"]
    },
    numeric(1)
  )
  names(errors) <- mtry_values
  errors
}

# Data ----

data_path <- "C:/Users/Mark/Desktop/Grad School/PDAT630/dropout_clean.csv"
dropout_clean <- read.csv(data_path)

factor_cols <- c(
  "Marital.status", "Scholarship.holder", "Tuition.fees.up.to.date",
  "Gender", "Displaced", "Daytime.evening.attendance", "Course",
  "Previous.qualification", "Mother.s.qualification",
  "Father.s.qualification", "Mother.s.occupation", "Father.s.occupation",
  "International", "Target"
)

# Fail with a readable message if the file does not have the expected columns.
missing_cols <- factor_cols[!factor_cols %in% names(dropout_clean)]
if (length(missing_cols) > 0) {
  stop(
    "Columns missing from dropout data: ",
    paste(missing_cols, collapse = ", "),
    call. = FALSE
  )
}

dropout_clean[factor_cols] <- lapply(dropout_clean[factor_cols], factor)
str(dropout_clean)

# Random Forest 1: default mtry ----

dropout_clean_rf1 <- dropout_clean

split1 <- split_train_test(dropout_clean_rf1)
df.train1 <- split1$train
df.test1 <- split1$test

dropout.rf1 <- randomForest(Target ~ ., data = df.train1, importance = TRUE)
print(dropout.rf1)

pred1 <- predict(dropout.rf1, df.test1, type = "response")
print(holdout_metrics(pred1, df.test1$Target))

oob.error.data <- oob_error_long(dropout.rf1)
print(plot_oob_error(
  oob.error.data,
  "Random Forest #1 Error vs Number Of Trees"
))

# Random Forest 2: mtry = 2 ----

dropout_clean_rf2 <- dropout_clean

split2 <- split_train_test(dropout_clean_rf2)
df.train2 <- split2$train
df.test2 <- split2$test

dropout.rf2 <- randomForest(
  Target ~ .,
  data = df.train2,
  importance = TRUE,
  mtry = 2
)
print(dropout.rf2)

pred2 <- predict(dropout.rf2, df.test2, type = "response")
print(holdout_metrics(pred2, df.test2$Target))

oob.error.data <- oob_error_long(dropout.rf2)
print(plot_oob_error(
  oob.error.data,
  "Random Forest #2 Error vs Number Of Trees"
))

# Random Forest 3: binary target, mtry = 2 ----

# Collapse every non-"Dropout" class into "Did Not Dropout".
dropout_clean_rf3 <- mutate(
  dropout_clean,
  Target = factor(ifelse(Target == "Dropout", "Dropout", "Did Not Dropout"))
)
str(dropout_clean_rf3)

# Class counts before and after collapsing the target.
table(dropout_clean$Target)
table(dropout_clean_rf3$Target)

split3 <- split_train_test(dropout_clean_rf3)
df.train3 <- split3$train
df.test3 <- split3$test

dropout.rf3 <- randomForest(
  Target ~ .,
  data = df.train3,
  importance = TRUE,
  mtry = 2
)
print(dropout.rf3)

pred3 <- predict(dropout.rf3, df.test3, type = "response")
print(holdout_metrics(pred3, df.test3$Target))

oob.error.data <- oob_error_long(dropout.rf3)
print(plot_oob_error(
  oob.error.data,
  "Random Forest #3 Error vs Number Of Trees"
))

varImpPlot(dropout.rf3, main = "Variable Importance Plot")

# mtry comparison ----

# Training set of model 1: final OOB error for mtry = 1..10.
oob.values <- oob_error_by_mtry(df.train1, mtry_values = 1:10)
oob.values
# Lowest OOB error, and the mtry value that achieved it.
min(oob.values)
which.min(oob.values)

# Training set of model 3 (binary target): final OOB error for mtry = 1..10.
oob.values <- oob_error_by_mtry(df.train3, mtry_values = 1:10)
oob.values
# Lowest OOB error, and the mtry value that achieved it.
min(oob.values)
which.min(oob.values)