library(tidyverse)
library(caret)
library(neuralnet)

# Min-max rescale a numeric (or logical) vector to the [0, 1] interval.
#
# Arguments:
#   x      Numeric or logical vector.
#   na.rm  If TRUE (default), missing values are ignored when computing the
#          range and stay NA in the result. If FALSE, any NA in `x` makes the
#          whole result NA, as plain min()/max() arithmetic would.
#
# Returns a numeric vector of the same length as `x`. A vector with no
# spread (all non-missing values equal) maps to 0 instead of the NaN that
# 0 / 0 would produce, so constant columns do not poison later model fits.
normalize <- function(x, na.rm = TRUE) {
  stopifnot(is.numeric(x) || is.logical(x))
  # Nothing to scale: avoid min()/max() warnings on empty or all-NA input.
  if (length(x) == 0L || all(is.na(x))) {
    return(as.numeric(x))
  }
  rng <- range(x, na.rm = na.rm)
  span <- rng[2] - rng[1]
  # Only reachable with na.rm = FALSE and at least one NA in x.
  if (is.na(span)) {
    return(rep(NA_real_, length(x)))
  }
  # Constant vector: dividing by a zero range would give NaN.
  if (span == 0) {
    return(ifelse(is.na(x), NA_real_, 0))
  }
  (x - rng[1]) / span
}

# Load the cleaned dropout data set from its local CSV export.
dropout_clean <- read.csv(
  "C:/Users/Mark/Desktop/Grad School/PDAT630/dropout_clean.csv"
)

# Earlier experiments that treated the categorical columns as factors; kept
# for reference but not executed.
#factor_cols=c("Marital.status","Scholarship.holder","Tuition.fees.up.to.date","Gender","Displaced","Daytime.evening.attendance","Course","Previous.qualification","Mother.s.qualification","Father.s.qualification","Mother.s.occupation","Father.s.occupation","International","Target")
#dropout_clean[factor_cols] <- lapply(dropout_clean[factor_cols], factor)
#dropout_clean_nn=mutate(dropout_clean, Target = factor(ifelse(Target == "Dropout", 0, 1)))

# Recode the outcome as numeric: "Dropout" becomes 0, every other value 1.
dropout_clean_nn <- dropout_clean %>%
  mutate(Target = ifelse(Target == "Dropout", 0, 1))

str(dropout_clean_nn)

# Rescale every column with normalize() so all model inputs share one range.
nor <- dropout_clean_nn %>%
  lapply(normalize) %>%
  as.data.frame()


## Normal 1 Layer Neural Network
# Fit a network with a single hidden layer of `hid` nodes on a random 85%
# of the rows and print its accuracy on the remaining 15%.

# Split by row index. setdiff() on the data frames de-duplicates rows and
# also drops any test row that is identical to a training row, so the test
# set could silently shrink.
train_rows <- sample.int(nrow(nor), size = round(0.85 * nrow(nor)))
df.train <- nor[train_rows, , drop = FALSE]
df.test <- nor[-train_rows, , drop = FALSE]
hid <- 1

nn <- neuralnet(
  Target ~ .,
  data = df.train,
  stepmax = 1e7,
  hidden = hid,
  threshold = .1
)
plot(nn)

# predict() for neuralnet fits takes no `type` argument; it returns the raw
# network output, which is cut at 0.5 to get a class label.
pred <- predict(nn, df.test)
# Fix the levels on both factors so confusionMatrix() still works when the
# model predicts only one class.
pred <- factor(ifelse(as.vector(pred) > .5, 1, 0), levels = c(0, 1))
actual <- factor(df.test$Target, levels = c(0, 1))
# confusionMatrix() expects predictions first (`data`), truth second.
print(confusionMatrix(data = pred, reference = actual)$overall[1])


## Normal 2 Layer Neural Network
# Fit a network with two hidden layers of `hid2` nodes each on a random 85%
# of the rows and print its accuracy on the remaining 15%.

# Split by row index. setdiff() on the data frames de-duplicates rows and
# also drops any test row that is identical to a training row, so the test
# set could silently shrink.
train_rows <- sample.int(nrow(nor), size = round(0.85 * nrow(nor)))
df.train <- nor[train_rows, , drop = FALSE]
df.test <- nor[-train_rows, , drop = FALSE]
hid2 <- 4

nn2 <- neuralnet(
  Target ~ .,
  data = df.train,
  stepmax = 1e7,
  hidden = c(hid2, hid2),
  threshold = .1
)
plot(nn2)

# predict() for neuralnet fits takes no `type` argument; it returns the raw
# network output, which is cut at 0.5 to get a class label.
pred2 <- predict(nn2, df.test)
# Fix the levels on both factors so confusionMatrix() still works when the
# model predicts only one class.
pred2 <- factor(ifelse(as.vector(pred2) > .5, 1, 0), levels = c(0, 1))
actual2 <- factor(df.test$Target, levels = c(0, 1))
# confusionMatrix() expects predictions first (`data`), truth second.
print(confusionMatrix(data = pred2, reference = actual2)$overall[1])


## Testing The Nodes 1st layer
# Sweep the size of a single hidden layer from 0 to max_nodes. Each size is
# refitted max_runs times on a fresh random 85/15 split, and the held-out
# accuracy of every fit is stored in accuracy_runs_1.
max_nodes <- 10
max_runs <- 10
# One row per run; a "<i> nodes" column is appended for every layer size.
accuracy_runs_1 <- data.frame(id = seq_len(max_runs))

total_time <- proc.time()

for (i in 0:max_nodes) {
  i_time <- proc.time()
  # Preallocate rather than growing the vector with c() on every run.
  acc_in <- numeric(max_runs)
  for (j in seq_len(max_runs)) {
    # Split by row index. setdiff() on the data frames de-duplicates rows
    # and drops test rows identical to a training row.
    train_rows <- sample.int(nrow(nor), size = round(0.85 * nrow(nor)))
    df.train <- nor[train_rows, , drop = FALSE]
    df.test <- nor[-train_rows, , drop = FALSE]
    # NOTE(review): threshold is .5 here but .1 in the other fits of this
    # script; confirm that the looser stopping rule is intended.
    nn <- neuralnet(
      Target ~ .,
      data = df.train,
      stepmax = 1e7,
      hidden = i,
      threshold = .5
    )
    # predict() for neuralnet fits takes no `type` argument; the raw output
    # is cut at 0.5 to get a class label.
    pred <- predict(nn, df.test)
    # Fixed levels keep confusionMatrix() from failing when a small network
    # predicts a single class for every test row.
    pred <- factor(ifelse(as.vector(pred) > .5, 1, 0), levels = c(0, 1))
    actual <- factor(df.test$Target, levels = c(0, 1))
    acc_in[j] <- confusionMatrix(
      data = pred,
      reference = actual
    )$overall[["Accuracy"]]
    print(paste("node", i, "run", j))
  }
  print(paste(i, "took", (proc.time() - i_time)[["elapsed"]]))
  accuracy_runs_1[paste(i, "nodes")] <- acc_in
}
print(paste("whole thing took", (proc.time() - total_time)[["elapsed"]]))

# Collapse the per-run accuracies into one mean and one standard deviation
# per node count (every column of accuracy_runs_1 except the run id).
run_cols_1 <- subset(accuracy_runs_1, select = -c(id))
# vapply() pins the result to one number per column, unlike sapply().
scores_1 <- data.frame(mean = vapply(run_cols_1, mean, numeric(1)))
sd_1 <- data.frame(std = vapply(run_cols_1, sd, numeric(1)))

accuracy_1 <- data.frame(nodes = 0:max_nodes)
accuracy_1["mean"] <- scores_1
best_mean_1 <- max(accuracy_1$mean)

# Mean accuracy against node count, with the best mean marked in red.
ggplot(data = accuracy_1, aes(x = nodes, y = mean)) +
  geom_point() +
  geom_line() +
  labs(title = "Average Accuracy Over 10 Runs For Different Number Of Nodes. 1 Layer\n       ") +
  xlab("Number Of Nodes") +
  ylab("Average Accuracy") +
  # `nodes` is numeric, so use a continuous scale with one break per tested
  # size; the former discrete limits of 1..max_nodes left node 0 unlabelled.
  scale_x_continuous(breaks = accuracy_1$nodes) +
  geom_hline(yintercept = best_mean_1, color = "red") +
  annotate(
    "text",
    x = 3,
    y = best_mean_1 + .002,
    label = round(best_mean_1, 2),
    color = "red"
  )


# Pair each tested node count with the standard deviation of its run
# accuracies (sd_1), then plot the spread; the title is the run count.
std_1 <- data.frame(nodes = 0:max_nodes)
std_1["std"] <- sd_1

std_1 %>%
  ggplot(aes(x = nodes, y = std)) +
  geom_point() +
  geom_line() +
  labs(title = max_runs)


## Testing The Nodes 2nd layer
# Sweep networks with two equally sized hidden layers, from 1 to max_nodes
# nodes per layer. Each size is refitted max_runs times on a fresh random
# 85/15 split, and the held-out accuracy of every fit is stored in
# accuracy_runs_2.
max_nodes <- 10
max_runs <- 10
# One row per run; a "<i> nodes" column is appended for every layer size.
accuracy_runs_2 <- data.frame(id = seq_len(max_runs))

total_time <- proc.time()

for (i in seq_len(max_nodes)) {
  i_time <- proc.time()
  # Preallocate rather than growing the vector with c() on every run.
  acc_in <- numeric(max_runs)
  for (j in seq_len(max_runs)) {
    # Split by row index. setdiff() on the data frames de-duplicates rows
    # and drops test rows identical to a training row.
    train_rows <- sample.int(nrow(nor), size = round(0.85 * nrow(nor)))
    df.train <- nor[train_rows, , drop = FALSE]
    df.test <- nor[-train_rows, , drop = FALSE]
    nn <- neuralnet(
      Target ~ .,
      data = df.train,
      stepmax = 1e7,
      hidden = c(i, i),
      threshold = .1
    )
    # predict() for neuralnet fits takes no `type` argument; the raw output
    # is cut at 0.5 to get a class label.
    pred <- predict(nn, df.test)
    # Fixed levels keep confusionMatrix() from failing when a small network
    # predicts a single class for every test row.
    pred <- factor(ifelse(as.vector(pred) > .5, 1, 0), levels = c(0, 1))
    actual <- factor(df.test$Target, levels = c(0, 1))
    acc_in[j] <- confusionMatrix(
      data = pred,
      reference = actual
    )$overall[["Accuracy"]]
    print(paste("node", i, "run", j))
  }
  print(paste(i, "took", (proc.time() - i_time)[["elapsed"]]))
  accuracy_runs_2[paste(i, "nodes")] <- acc_in
}
print(paste("whole thing took", (proc.time() - total_time)[["elapsed"]]))

# Collapse the per-run accuracies into one mean and one standard deviation
# per node count (every column of accuracy_runs_2 except the run id).
run_cols_2 <- subset(accuracy_runs_2, select = -c(id))
# vapply() pins the result to one number per column, unlike sapply().
scores_2 <- data.frame(mean = vapply(run_cols_2, mean, numeric(1)))
sd_2 <- data.frame(std = vapply(run_cols_2, sd, numeric(1)))

accuracy_2 <- data.frame(nodes = seq_len(max_nodes))
accuracy_2["mean"] <- scores_2
best_mean_2 <- max(accuracy_2$mean)

# Mean accuracy against node count, with the best mean marked in red.
ggplot(data = accuracy_2, aes(x = nodes, y = mean)) +
  geom_point() +
  geom_line() +
  labs(title = "Average Accuracy Over 10 Runs For Different Number Of Nodes. 2 Layers\n       ") +
  xlab("Number Of Nodes") +
  ylab("Average Accuracy") +
  # `nodes` is numeric, so use a continuous scale with one break per tested
  # size instead of forcing a discrete scale onto numeric data.
  scale_x_continuous(breaks = accuracy_2$nodes) +
  geom_hline(yintercept = best_mean_2, color = "red") +
  annotate(
    "text",
    x = 3,
    y = best_mean_2 + .002,
    label = round(best_mean_2, 2),
    color = "red"
  )

# Pair each tested node count with the standard deviation of its run
# accuracies (sd_2), then plot the spread; the title is the run count.
std_2 <- data.frame(nodes = 1:max_nodes)
std_2["std"] <- sd_2

std_2 %>%
  ggplot(aes(x = nodes, y = std)) +
  geom_point() +
  geom_line() +
  labs(title = max_runs)


# ## Testing The Nodes 3rd layer
# max_nodes=10
# max_runs=10
# accuracy_runs_3=data.frame(id=c(1:max_runs))
# 
# total_time <- proc.time()
# 
# for (i in 1:max_nodes){
#   i_time=proc.time()
#   acc_in=c()
#   for (j in 1:max_runs){
#     df.train <- sample_frac(nor, size = .85)
#     df.test <- setdiff(nor, df.train)
#     nn=neuralnet(Target~.,
#                  data = df.train, 
#                  stepmax=1e7,
#                  hidden=c(i,i,i),
#                  threshold = .1)
#     pred=predict(nn, df.test, type="response")
#     pred=as.factor(ifelse(pred>.5, 1, 0))
#     #print(confusionMatrix(as.factor(df.test$Survived), pred)$overall[1])
#     acc_in=c(acc_in,confusionMatrix(as.factor(df.test$Target), pred)$overall[1])
#     print(paste("node", i, "run", j))
#   }
#   print(paste(i, "took", proc.time()[3] - i_time)[3])
#   accuracy_runs_3[paste(i, "nodes")] = acc_in
# }
# print(paste("whole thing took", proc.time()[3] - total_time)[3])
# 
# scores_3=data.frame(sapply(subset(accuracy_runs_3, select = -c(id)), function(x) mean(x)))
# sd_3=data.frame(sapply(subset(accuracy_runs_3, select = -c(id)), function(x) sd(x)))
# 
# accuracy_3=data.frame(nodes=c(1:max_nodes))
# accuracy_3["mean"]=scores_3
# 
# ggplot(data=accuracy_3, aes(x=nodes, y=mean))+
#   geom_point()+
#   geom_line()+
#   labs(title="Average Accuracy Over 10 Runs For Different Number Of Nodes. 3 Layers
#        ")+
#   xlab("Number Of Nodes")+
#   ylab("Average Accuracy")+
#   scale_x_discrete(limits=factor(c(1:max_nodes)))
# 
# std_3=data.frame(nodes=c(1:max_nodes))
# std_3["std"]=sd_3
# 
# ggplot(data=std_3, aes(x=nodes, y=std))+
#   geom_point()+
#   geom_line()+
#   labs(title=max_runs)