# Im_getting_a_fucking_masters/Dropout Data Cleaning.R
#
# 142 lines | 12 KiB | R
# (file-viewer links: Raw, Normal View, History)
#
# 2024-06-15 21:57:46 -05:00
library(tidyverse)
library(caret)
# Read the raw dropout data from its fixed local path.
dropout <- read.csv("C:/Users/Mark/Desktop/Grad School/PDAT630/dropout.csv")

# All cleaning is applied to a copy; `dropout` keeps the data as imported.
dropout_clean <- dropout

# Overwrite the name of the first column with "Marital.status".
# NOTE(review): presumably the first header is imported with a mangled name
# (e.g. a byte-order-mark prefix) - confirm against names(dropout)[1].
names(dropout_clean)[1] <- "Marital.status"
## Frequency tables for the categorical columns whose sparse codes are
## pooled further down, so that every bucket ends up with at least 10 rows.
## Each call is a bare top-level expression: the table is shown only where
## top-level results are auto-printed.
table(dropout_clean[["Marital.status"]])
table(dropout_clean[["Application.mode"]])
table(dropout_clean[["Previous.qualification"]])
table(dropout_clean[["Mother.s.qualification"]])
table(dropout_clean[["Father.s.qualification"]])
table(dropout_clean[["Mother.s.occupation"]])
table(dropout_clean[["Father.s.occupation"]])
# Pool the Marital.status codes 3 and 6 into a single bucket coded 7; every
# other value (including NA) is passed through unchanged.
# The codes are written as strings; %in% compares them against the column
# after coercing it to character.
# NOTE(review): 7 is presumably not an existing Marital.status code - confirm
# against the frequency table.
dropout_clean <- dropout_clean %>%
  mutate(
    Marital.status = ifelse(
      Marital.status %in% c("3", "6"),
      7,
      Marital.status
    )
  )
# Pool the Application.mode codes 2, 5, 10, 11 and 18 into a single bucket
# coded 19; every other value (including NA) is passed through unchanged.
# The codes are written as strings; %in% compares them against the column
# after coercing it to character.
# NOTE(review): 19 is presumably not an existing Application.mode code -
# confirm against the frequency table.
dropout_clean <- dropout_clean %>%
  mutate(
    Application.mode = ifelse(
      Application.mode %in% c("2", "5", "10", "11", "18"),
      19,
      Application.mode
    )
  )
# Pool the Previous.qualification codes 4, 5, 8, 10, 11, 13 and 17 into a
# single bucket coded 18; every other value (including NA) is passed through
# unchanged.
# The codes are written as strings; %in% compares them against the column
# after coercing it to character.
# NOTE(review): 18 is presumably not an existing Previous.qualification
# code - confirm against the frequency table.
dropout_clean <- dropout_clean %>%
  mutate(
    Previous.qualification = ifelse(
      Previous.qualification %in% c("4", "5", "8", "10", "11", "13", "17"),
      18,
      Previous.qualification
    )
  )
# Pool the listed Mother.s.qualification codes into a single bucket coded 35;
# every other value (including NA) is passed through unchanged.
# The codes are written as strings; %in% compares them against the column
# after coercing it to character.
# NOTE(review): 35 is presumably not an existing Mother.s.qualification
# code - confirm against the frequency table.
dropout_clean <- dropout_clean %>%
  mutate(
    Mother.s.qualification = ifelse(
      Mother.s.qualification %in% c(
        "6", "7", "8", "9",
        "11", "12",
        "14", "15", "16", "17", "18",
        "20", "21",
        "24", "25", "26", "27", "28", "29"
      ),
      35,
      Mother.s.qualification
    )
  )
# Pool the listed Father.s.qualification codes into a single bucket coded 35;
# every other value (including NA) is passed through unchanged.
# The codes are written as strings; %in% compares them against the column
# after coercing it to character.
# NOTE(review): 35 is presumably not an existing Father.s.qualification
# code - confirm against the frequency table.
dropout_clean <- dropout_clean %>%
  mutate(
    Father.s.qualification = ifelse(
      Father.s.qualification %in% c(
        "6", "7", "8",
        "11", "12", "13",
        "15", "16", "17", "18", "19", "20", "21", "22", "23",
        "25", "26",
        "30", "31", "32", "33", "34"
      ),
      35,
      Father.s.qualification
    )
  )
# Pool the listed Mother.s.occupation codes into a single bucket coded 47;
# every other value (including NA) is passed through unchanged.
# The codes are written as strings; %in% compares them against the column
# after coercing it to character.
# NOTE(review): 47 is presumably not an existing Mother.s.occupation code -
# confirm against the frequency table.
dropout_clean <- dropout_clean %>%
  mutate(
    Mother.s.occupation = ifelse(
      Mother.s.occupation %in% c(
        "11",
        "14", "15", "16", "17", "18", "19", "20", "21",
        "22", "23", "24", "25", "26", "27", "28",
        "30", "31"
      ),
      47,
      Mother.s.occupation
    )
  )
# Pool the listed Father.s.occupation codes (14 through 43, plus 45 and 46)
# into a single bucket coded 47; every other value (including NA) is passed
# through unchanged. Code 44 is deliberately absent from the list.
# The codes are written as strings; %in% compares them against the column
# after coercing it to character.
# NOTE(review): 47 is presumably not an existing Father.s.occupation code -
# confirm against the frequency table.
dropout_clean <- dropout_clean %>%
  mutate(
    Father.s.occupation = ifelse(
      Father.s.occupation %in% c(
        "14", "15", "16", "17", "18", "19",
        "20", "21", "22", "23", "24", "25", "26", "27", "28", "29",
        "30", "31", "32", "33", "34", "35", "36", "37", "38", "39",
        "40", "41", "42", "43",
        "45", "46"
      ),
      47,
      Father.s.occupation
    )
  )
#factor_cols=c("Marital.status","Scholarship.holder","Tuition.fees.up.to.date","Gender","Displaced","Daytime.evening.attendance","Application.mode", "Course","Previous.qualification","Mother.s.qualification","Father.s.qualification","Mother.s.occupation","Father.s.occupation","International","Target")
#dropout_clean[factor_cols] <- lapply(dropout_clean[factor_cols], factor)
#sapply(dropout_clean, table)
# Keep only the columns listed here, in this order; all rows are retained and
# every other column is dropped.
# NOTE(review): Application.mode is recoded earlier in this script but is not
# in this list, so it is discarded here - confirm that is intended.
dropout_clean <- dropout_clean[, c(
  "Gender", "Displaced", "Tuition.fees.up.to.date",
  "Scholarship.holder", "Marital.status", "Daytime.evening.attendance",
  "Course", "Previous.qualification", "Mother.s.qualification",
  "Father.s.qualification", "Mother.s.occupation", "Father.s.occupation",
  "Age.at.enrollment", "International", "Target"
), drop = FALSE]
#str(dropout_clean)
# Write the cleaned data frame to a fixed local path; row.names = FALSE keeps
# R's row index out of the file.
write.csv(
  x = dropout_clean,
  file = "C:/Users/Mark/Desktop/Grad School/PDAT630/dropout_clean.csv",
  row.names = FALSE
)