# Dropout prediction with neuralnet.
#
# Loads the cleaned dropout data, recodes Target to 0/1 (0 = "Dropout",
# 1 = anything else), min-max scales every column, fits a 1-hidden-layer and
# a 2-hidden-layer demo network, then measures test accuracy over repeated
# random 85/15 splits for a range of hidden-layer sizes and plots the mean
# and standard deviation of that accuracy.

library(tidyverse)
library(caret)
library(neuralnet)

# Helpers ----

# Min-max rescale a numeric vector to [0, 1].
# A constant vector (max == min) would otherwise become 0/0 = NaN for every
# element; it is mapped to all zeros instead. NA handling is unchanged: any
# NA in x still makes the whole result NA.
normalize <- function(x) {
  lo <- min(x)
  span <- max(x) - lo
  if (isTRUE(span == 0)) {
    return(x - lo)
  }
  (x - lo) / span
}

# Randomly split `data` into list(train = ..., test = ...).
# The split is done on row indices so the test set is exactly the rows that
# were not sampled for training. A row-wise setdiff() would additionally drop
# duplicated rows and any test row that happens to equal a training row.
split_train_test <- function(data, train_frac = .85) {
  n_rows <- nrow(data)
  train_rows <- sample.int(n_rows, size = round(train_frac * n_rows))
  in_train <- seq_len(n_rows) %in% train_rows
  list(
    train = data[train_rows, , drop = FALSE],
    test = data[!in_train, , drop = FALSE]
  )
}

# Predict with a fitted network and cut the output at .5 into classes 0/1.
# Levels are fixed to c(0, 1) so the factor always has both classes, even
# when the network predicts only one of them.
predict_class <- function(model, newdata) {
  raw <- predict(model, newdata, type = "response")
  factor(ifelse(as.vector(raw) > .5, 1, 0), levels = c(0, 1))
}

# Accuracy (named "Accuracy") of predicted classes against the 0/1 target.
# Predictions go in `data` and the observed classes in `reference`.
accuracy_of <- function(predicted, target) {
  actual <- factor(target, levels = c(0, 1))
  confusionMatrix(data = predicted, reference = actual)$overall[1]
}

# For every entry of `node_counts`, train `runs` networks with `layers`
# hidden layers of that many nodes, each on a fresh random split, and record
# the test accuracy. Returns a data frame with an `id` column (run number)
# and one "<n> nodes" column per node count. Prints progress and the elapsed
# seconds spent on each node count.
run_node_sweep <- function(data, node_counts, runs, layers, threshold) {
  results <- data.frame(id = seq_len(runs))
  for (nodes in node_counts) {
    nodes_start <- proc.time()[["elapsed"]]
    run_accuracy <- numeric(runs)
    for (run in seq_len(runs)) {
      parts <- split_train_test(data)
      model <- neuralnet(
        Target ~ .,
        data = parts$train,
        stepmax = 1e7,
        hidden = rep(nodes, layers),
        threshold = threshold
      )
      run_accuracy[run] <- accuracy_of(
        predict_class(model, parts$test),
        parts$test$Target
      )
      print(paste("node", nodes, "run", run))
    }
    print(paste(nodes, "took", proc.time()[["elapsed"]] - nodes_start))
    results[paste(nodes, "nodes")] <- run_accuracy
  }
  results
}

# Apply `stat` (e.g. mean or sd) to every per-node column of a sweep result,
# skipping the `id` column. Returns a numeric vector named by column.
summarise_runs <- function(accuracy_runs, stat) {
  run_cols <- accuracy_runs[names(accuracy_runs) != "id"]
  vapply(run_cols, stat, numeric(1))
}

# Line plot of mean accuracy per node count, with the best mean marked by a
# red horizontal line and label. `accuracy` needs columns `nodes` and `mean`.
# Ticks are placed at every node count present in `accuracy`, including 0.
plot_mean_accuracy <- function(accuracy, runs, layer_label) {
  best <- max(accuracy$mean)
  ggplot(data = accuracy, aes(x = nodes, y = mean)) +
    geom_point() +
    geom_line() +
    labs(title = paste0(
      "Average Accuracy Over ", runs,
      " Runs For Different Number Of Nodes. ", layer_label
    )) +
    xlab("Number Of Nodes") +
    ylab("Average Accuracy") +
    scale_x_continuous(breaks = accuracy$nodes) +
    geom_hline(yintercept = best, color = "red") +
    annotate(
      "text",
      x = 3,
      y = best + .002,
      label = round(best, 2),
      color = "red"
    )
}

# Line plot of the accuracy standard deviation per node count. `spread`
# needs columns `nodes` and `std`; the title is the number of runs.
plot_accuracy_sd <- function(spread, runs) {
  ggplot(data = spread, aes(x = nodes, y = std)) +
    geom_point() +
    geom_line() +
    labs(title = runs)
}

# Data ----

# NOTE(review): absolute, machine-specific path; the script only runs where
# this file exists.
dropout_clean <- read.csv("C:/Users/Mark/Desktop/Grad School/PDAT630/dropout_clean.csv")

# factor_cols <- c("Marital.status", "Scholarship.holder",
#   "Tuition.fees.up.to.date", "Gender", "Displaced",
#   "Daytime.evening.attendance", "Course", "Previous.qualification",
#   "Mother.s.qualification", "Father.s.qualification",
#   "Mother.s.occupation", "Father.s.occupation", "International", "Target")
# dropout_clean[factor_cols] <- lapply(dropout_clean[factor_cols], factor)
# dropout_clean_nn <- mutate(dropout_clean,
#   Target = factor(ifelse(Target == "Dropout", 0, 1)))

# Binary target: 0 for "Dropout", 1 for every other value.
dropout_clean_nn <- mutate(
  dropout_clean,
  Target = ifelse(Target == "Dropout", 0, 1)
)
str(dropout_clean_nn)

# Every column is rescaled, including Target (which stays 0/1).
nor <- as.data.frame(lapply(dropout_clean_nn, normalize))

# NOTE(review): no set.seed() is called, so splits, fitted weights and all
# accuracies below differ between executions.

# Normal 1 Layer Neural Network ----

parts <- split_train_test(nor)
df.train <- parts$train
df.test <- parts$test

hid <- 1
nn <- neuralnet(
  Target ~ .,
  data = df.train,
  stepmax = 1e7,
  hidden = hid,
  threshold = .1
)
plot(nn)
pred <- predict_class(nn, df.test)
print(accuracy_of(pred, df.test$Target))

# Normal 2 Layer Neural Network ----

parts <- split_train_test(nor)
df.train <- parts$train
df.test <- parts$test

hid2 <- 4
nn2 <- neuralnet(
  Target ~ .,
  data = df.train,
  stepmax = 1e7,
  hidden = c(hid2, hid2),
  threshold = .1
)
plot(nn2)
pred2 <- predict_class(nn2, df.test)
print(accuracy_of(pred2, df.test$Target))

# Testing The Nodes 1st layer ----

max_nodes <- 10
max_runs <- 10

total_time <- proc.time()
# Node counts start at 0 (hidden = 0) for the single-layer sweep.
# NOTE(review): this sweep uses threshold = .5 while every other fit in the
# script uses .1 -- confirm that is intentional.
accuracy_runs_1 <- run_node_sweep(
  nor,
  node_counts = 0:max_nodes,
  runs = max_runs,
  layers = 1,
  threshold = .5
)
print(paste(
  "whole thing took",
  proc.time()[["elapsed"]] - total_time[["elapsed"]]
))

scores_1 <- data.frame(mean = summarise_runs(accuracy_runs_1, mean))
sd_1 <- data.frame(std = summarise_runs(accuracy_runs_1, sd))

accuracy_1 <- data.frame(nodes = 0:max_nodes)
accuracy_1["mean"] <- scores_1$mean

plot_mean_accuracy(accuracy_1, max_runs, "1 Layer ")

std_1 <- data.frame(nodes = 0:max_nodes)
std_1["std"] <- sd_1$std

plot_accuracy_sd(std_1, max_runs)

# Testing The Nodes 2nd layer ----

max_nodes <- 10
max_runs <- 10

total_time <- proc.time()
accuracy_runs_2 <- run_node_sweep(
  nor,
  node_counts = 1:max_nodes,
  runs = max_runs,
  layers = 2,
  threshold = .1
)
print(paste(
  "whole thing took",
  proc.time()[["elapsed"]] - total_time[["elapsed"]]
))

scores_2 <- data.frame(mean = summarise_runs(accuracy_runs_2, mean))
sd_2 <- data.frame(std = summarise_runs(accuracy_runs_2, sd))

accuracy_2 <- data.frame(nodes = 1:max_nodes)
accuracy_2["mean"] <- scores_2$mean

plot_mean_accuracy(accuracy_2, max_runs, "2 Layers ")

std_2 <- data.frame(nodes = 1:max_nodes)
std_2["std"] <- sd_2$std

plot_accuracy_sd(std_2, max_runs)

# Testing The Nodes 3rd layer (disabled) ----
#
# max_nodes <- 10
# max_runs <- 10
#
# total_time <- proc.time()
# accuracy_runs_3 <- run_node_sweep(
#   nor,
#   node_counts = 1:max_nodes,
#   runs = max_runs,
#   layers = 3,
#   threshold = .1
# )
# print(paste(
#   "whole thing took",
#   proc.time()[["elapsed"]] - total_time[["elapsed"]]
# ))
#
# scores_3 <- data.frame(mean = summarise_runs(accuracy_runs_3, mean))
# sd_3 <- data.frame(std = summarise_runs(accuracy_runs_3, sd))
#
# accuracy_3 <- data.frame(nodes = 1:max_nodes)
# accuracy_3["mean"] <- scores_3$mean
#
# ggplot(data = accuracy_3, aes(x = nodes, y = mean)) +
#   geom_point() +
#   geom_line() +
#   labs(title = paste0(
#     "Average Accuracy Over ", max_runs,
#     " Runs For Different Number Of Nodes. 3 Layers "
#   )) +
#   xlab("Number Of Nodes") +
#   ylab("Average Accuracy") +
#   scale_x_continuous(breaks = accuracy_3$nodes)
#
# std_3 <- data.frame(nodes = 1:max_nodes)
# std_3["std"] <- sd_3$std
#
# plot_accuracy_sd(std_3, max_runs)