## Neural network implementation adapted from
## https://datascienceplus.com/fitting-neural-network-in-r/
## Fits a small feed-forward network that predicts `Survived` from the
## columns selected below; mainly an example of neuralnet syntax and workflow.

library("neuralnet")

data <- read.csv("train.csv", sep = ",")

# Report the number of missing values per column so gaps are visible before
# imputing. print() is explicit so the counts also show up under source().
print(colSums(is.na(data)))

# Fill missing Age values by linear interpolation over row position.
# stats::approx() is used instead of zoo::na.approx(): zoo is never loaded in
# this script, and na.approx() by default drops leading/trailing NAs, which
# would change the vector length and break the assignment. rule = 2 extends
# the nearest observed value to the ends instead.
age_pos <- seq_along(data$Age)
data$Age <- approx(age_pos, data$Age, xout = age_pos, rule = 2)$y

# Encode Sex and Cabin as integer level codes. Going through as.factor() is
# required because read.csv() returns character columns since R 4.0, and
# as.numeric() on raw strings would produce NA for every row.
data$Sex <- as.numeric(as.factor(data$Sex))
data$Cabin <- as.numeric(as.factor(data$Cabin))

# Keep only the response (Survived) and the predictors. Selecting by name
# rather than by position keeps this correct if the column order changes.
model_cols <- c(
  "Survived", "Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Cabin"
)
data <- data[, model_cols]

# Use 80% of the rows for training and the remaining 20% for testing.
# `index` holds the randomly sampled row numbers of the training set.
# NOTE(review): no seed is set, so the split differs between runs.
index <- sample(seq_len(nrow(data)), round(0.80 * nrow(data)))
train <- data[index, ]
test <- data[-index, ]

# Min-max normalisation to [0, 1], computed per column over the full data.
maxs <- vapply(data, max, numeric(1))
mins <- vapply(data, min, numeric(1))

# Scale every column and convert the resulting matrix back to a data frame.
scaled <- as.data.frame(scale(data, center = mins, scale = maxs - mins))

# Apply the same train/test split to the scaled data.
train_ <- scaled[index, ]
test_ <- scaled[-index, ]

# Build the model formula as text ("Survived ~ a + b + ...") from the column
# names and convert it with as.formula() before passing it to neuralnet().
n <- names(train_)
predictors <- setdiff(n, "Survived")
f <- as.formula(paste("Survived ~", paste(predictors, collapse = " + ")))

# Train the network with one hidden layer of 4 units. linear.output = FALSE
# applies the activation function to the output neuron as well
# (classification setting); TRUE would be used for regression.
nn <- neuralnet(
  f,
  data = train_,
  hidden = c(4),
  linear.output = FALSE,
  act.fct = "tanh",
  lifesign = "minimal",
  stepmax = 2e6
)
plot(nn, rep = "best")

# Run the test rows through the network, passing only the predictor columns
# (the response column Survived is excluded).
pr.nn <- compute(nn, test_[, predictors])

# Turn the network output into class labels: values >= 0.5 become 1, else 0.
nn.res <- ifelse(pr.nn$net.result >= 0.5, 1, 0)

# Mean absolute difference between the scaled labels and the predictions.
# When Survived contains both 0 and 1 the scaled labels stay 0/1, so this is
# the misclassification rate (and, for 0/1 values, also equals the MSE).
MSE.nn <- mean(abs(test_$Survived - nn.res))
print(paste("MSE for neural network: ", MSE.nn))