Im_getting_a_fucking_masters/Dropout RF.R

128 lines
4.5 KiB
R
Raw Normal View History

2024-06-15 21:54:33 -05:00
library(tidyverse)
library(caret)
library(randomForest)
# Load the cleaned dropout data set and turn every categorical column
# (including the response, Target) into a factor before modelling.
dropout_clean <- read.csv(
  "C:/Users/Mark/Desktop/Grad School/PDAT630/dropout_clean.csv"
)
factor_cols <- c(
  "Marital.status",
  "Scholarship.holder",
  "Tuition.fees.up.to.date",
  "Gender",
  "Displaced",
  "Daytime.evening.attendance",
  "Course",
  "Previous.qualification",
  "Mother.s.qualification",
  "Father.s.qualification",
  "Mother.s.occupation",
  "Father.s.occupation",
  "International",
  "Target"
)
dropout_clean <- mutate(dropout_clean, across(all_of(factor_cols), factor))
# Quick structural check of the converted data frame.
str(dropout_clean)
## Random Forest 1 ----
# Multi-class model: Target ~ every other column, using randomForest's
# default mtry, scored on a 15% hold-out set.
dropout_clean_rf1 <- dropout_clean
str(dropout_clean_rf1)

# Fix the RNG seed so the split and the forest are reproducible between runs.
set.seed(630)

# Split by row index instead of sample_frac() + setdiff(): setdiff() on data
# frames keeps distinct rows only, so duplicated observations would silently
# disappear from the test set.
train_rows1 <- sample(
  nrow(dropout_clean_rf1),
  size = round(0.85 * nrow(dropout_clean_rf1))
)
df.train1 <- dropout_clean_rf1[train_rows1, ]
df.test1 <- dropout_clean_rf1[-train_rows1, ]

dropout.rf1 <- randomForest(
  Target ~ .,
  data = df.train1,
  importance = TRUE
  # proximity = TRUE
)
print(dropout.rf1)

# Hold-out accuracy. caret expects the predictions as `data` and the observed
# classes as `reference`; accuracy is selected by name rather than position.
pred1 <- predict(dropout.rf1, newdata = df.test1, type = "response")
cm1 <- confusionMatrix(data = pred1, reference = df.test1$Target)
print(cm1$overall["Accuracy"])

# Long-format table of the error after each tree: one series for the overall
# OOB error plus one per class. Series names come from the err.rate column
# names, so nothing here depends on the exact class labels.
err1 <- dropout.rf1$err.rate
oob.error.data <- data.frame(
  Trees = rep(seq_len(nrow(err1)), times = ncol(err1)),
  Type = rep(colnames(err1), each = nrow(err1)),
  Error = as.vector(err1) # column-major, matching the Type ordering above
)
ggplot(data = oob.error.data, aes(x = Trees, y = Error)) +
  geom_line(aes(color = Type)) +
  ggtitle("Random Forest #1 Error vs Number Of Trees")
## Random Forest 2 ----
# Same multi-class model as before but with mtry fixed at 2, scored on a 15%
# hold-out set.
dropout_clean_rf2 <- dropout_clean
str(dropout_clean_rf2)

# Fix the RNG seed so the split and the forest are reproducible between runs.
set.seed(630)

# Split by row index instead of sample_frac() + setdiff(): setdiff() on data
# frames keeps distinct rows only, so duplicated observations would silently
# disappear from the test set.
train_rows2 <- sample(
  nrow(dropout_clean_rf2),
  size = round(0.85 * nrow(dropout_clean_rf2))
)
df.train2 <- dropout_clean_rf2[train_rows2, ]
df.test2 <- dropout_clean_rf2[-train_rows2, ]

dropout.rf2 <- randomForest(
  Target ~ .,
  data = df.train2,
  importance = TRUE,
  # proximity = TRUE,
  mtry = 2
)
print(dropout.rf2)

# Hold-out accuracy. The script previously printed overall[2] here, which is
# Kappa, while the other forests report overall[1] (Accuracy); selecting by
# name keeps the three reports comparable. caret expects predictions as
# `data` and observed classes as `reference`.
pred2 <- predict(dropout.rf2, newdata = df.test2, type = "response")
cm2 <- confusionMatrix(data = pred2, reference = df.test2$Target)
print(cm2$overall["Accuracy"])

# Long-format table of the error after each tree: one series for the overall
# OOB error plus one per class. Series names come from the err.rate column
# names, so nothing here depends on the exact class labels.
err2 <- dropout.rf2$err.rate
oob.error.data <- data.frame(
  Trees = rep(seq_len(nrow(err2)), times = ncol(err2)),
  Type = rep(colnames(err2), each = nrow(err2)),
  Error = as.vector(err2) # column-major, matching the Type ordering above
)
ggplot(data = oob.error.data, aes(x = Trees, y = Error)) +
  geom_line(aes(color = Type)) +
  ggtitle("Random Forest #2 Error vs Number Of Trees")
## Random Forest 3 ----
# Binary model: Target is collapsed to "Dropout" vs "Did Not Dropout", then
# fitted with mtry = 2 and scored on a 15% hold-out set.
dropout_clean_rf3 <- mutate(
  dropout_clean,
  Target = factor(ifelse(Target == "Dropout", "Dropout", "Did Not Dropout"))
)
str(dropout_clean_rf3)
# Compare class counts before and after collapsing the response.
table(dropout_clean$Target)
table(dropout_clean_rf3$Target)

# Fix the RNG seed so the split and the forest are reproducible between runs.
set.seed(630)

# Split by row index instead of sample_frac() + setdiff(): setdiff() on data
# frames keeps distinct rows only, so duplicated observations would silently
# disappear from the test set.
train_rows3 <- sample(
  nrow(dropout_clean_rf3),
  size = round(0.85 * nrow(dropout_clean_rf3))
)
df.train3 <- dropout_clean_rf3[train_rows3, ]
df.test3 <- dropout_clean_rf3[-train_rows3, ]

dropout.rf3 <- randomForest(
  Target ~ .,
  data = df.train3,
  importance = TRUE,
  # proximity = TRUE,
  # keep.forest = TRUE,
  mtry = 2
)
print(dropout.rf3)

# Hold-out accuracy. caret expects the predictions as `data` and the observed
# classes as `reference`; accuracy is selected by name rather than position.
pred3 <- predict(dropout.rf3, newdata = df.test3, type = "response")
cm3 <- confusionMatrix(data = pred3, reference = df.test3$Target)
print(cm3$overall["Accuracy"])

# Long-format table of the error after each tree: one series for the overall
# OOB error plus one per class. Series names come from the err.rate column
# names, so the two-class labels do not have to be repeated by hand.
err3 <- dropout.rf3$err.rate
oob.error.data <- data.frame(
  Trees = rep(seq_len(nrow(err3)), times = ncol(err3)),
  Type = rep(colnames(err3), each = nrow(err3)),
  Error = as.vector(err3) # column-major, matching the Type ordering above
)
ggplot(data = oob.error.data, aes(x = Trees, y = Error)) +
  geom_line(aes(color = Type)) +
  ggtitle("Random Forest #3 Error vs Number Of Trees")

# Importance measures are available because the forest was fitted with
# importance = TRUE.
varImpPlot(
  dropout.rf3,
  main = "Variable Importance Plot"
)
## mtry search on the multi-class training set ----
# Fit one forest per candidate mtry on df.train1 and record the OOB error
# after the last tree of each fit.
set.seed(630) # reproducible comparison across mtry values
mtry_grid <- seq_len(10)
oob.values <- numeric(length(mtry_grid)) # numeric, not logical, preallocation
for (i in mtry_grid) {
  # NOTE(review): importance = TRUE is not needed for the OOB error itself;
  # kept in case temp.model is inspected after the loop.
  temp.model <- randomForest(
    Target ~ .,
    data = df.train1,
    importance = TRUE,
    mtry = i
  )
  # Last row of err.rate is the error with all trees grown.
  oob.values[i] <- temp.model$err.rate[nrow(temp.model$err.rate), "OOB"]
}
oob.values
## find the minimum error
min(oob.values)
## mtry with the lowest OOB error (the script had "2" noted here, presumably
## copied from an earlier run)
mtry_grid[which.min(oob.values)]
## mtry search on the binary training set ----
# Fit one forest per candidate mtry on df.train3 and record the OOB error
# after the last tree of each fit.
set.seed(630) # reproducible comparison across mtry values
mtry_grid <- seq_len(10)
oob.values <- numeric(length(mtry_grid)) # numeric, not logical, preallocation
for (i in mtry_grid) {
  temp.model <- randomForest(Target ~ ., data = df.train3, mtry = i)
  # Last row of err.rate is the error with all trees grown.
  oob.values[i] <- temp.model$err.rate[nrow(temp.model$err.rate), "OOB"]
}
oob.values
## find the minimum error
min(oob.values)
## mtry with the lowest OOB error (the script had "2" noted here, presumably
## copied from an earlier run)
mtry_grid[which.min(oob.values)]