## Neural network implementation from https://datascienceplus.com/fitting-neural-network-in-r/
# just fitting a linear model but is a good example for syntax and workflow
library("neuralnet")

# Load the raw training data from the working directory.
data <- read.csv("train.csv", sep = ",")

# Report how many values are missing in each column so gaps can be dealt with
# before modelling. Printed explicitly so the table also shows up when the
# script is run through source().
print(colSums(is.na(data)))

# Fill missing Age values by linear interpolation over row position between
# the nearest known neighbours. Base approx() is used so no extra package is
# needed; rule = 2 carries the nearest known value out to leading/trailing
# gaps so the result always has one value per row.
age_known <- which(!is.na(data$Age))
data$Age <- approx(
  x = age_known,
  y = data$Age[age_known],
  xout = seq_along(data$Age),
  rule = 2
)$y

# Encode Sex and Cabin as integer level codes. Going through factor() first
# is required because read.csv() returns character columns since R 4.0, and
# as.numeric() on character labels would only produce NA.
data$Sex <- as.numeric(factor(data$Sex))
data$Cabin <- as.numeric(factor(data$Cabin))

# Keep only the response (Survived) and the predictors, selected by name so
# the script does not depend on the column order of the CSV.
model_cols <- c(
  "Survived", "Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Cabin"
)
data <- data[, model_cols]

# Min-max scaling and neuralnet() cannot cope with missing values, so fail
# early with a clear message instead of silently producing NA weights.
if (anyNA(data)) {
  stop("model columns still contain missing values", call. = FALSE)
}

# Use 80% of the rows for training and 20% for testing, sampled randomly.
# index holds the row numbers of the training rows.
index <- sample(seq_len(nrow(data)), round(0.80 * nrow(data)))
train <- data[index, ]
test <- data[-index, ]

# Min-max normalisation to [0, 1]; the per-column ranges are taken over the
# full data set (training and test rows together).
maxs <- vapply(data, max, numeric(1))
mins <- vapply(data, min, numeric(1))
# Scale every column and put the scaled values back into a data frame.
scaled <- as.data.frame(scale(data, center = mins, scale = maxs - mins))
# Apply the same train/test split to the scaled data.
train_ <- scaled[index, ]
test_ <- scaled[-index, ]

n <- names(train_)
# The formula is built from the column names: Survived on the left-hand side,
# every other column as a predictor.
predictors <- setdiff(n, "Survived")
f <- reformulate(predictors, response = "Survived")

# Train a network with a single hidden layer of 4 units and tanh activation.
# linear.output = FALSE applies the activation to the output node as well
# (classification); TRUE would be used for regression.
nn <- neuralnet(
  f,
  data = train_,
  hidden = c(4),
  linear.output = FALSE,
  act.fct = "tanh",
  lifesign = "minimal",
  stepmax = 2e6
)

plot(nn, rep = "best")

# Predict on the test rows, passing only the predictor columns (the response
# column Survived is excluded).
pr.nn <- compute(nn, test_[, predictors])
# Turn the network output into hard 0/1 labels with a 0.5 threshold.
nn.res <- ifelse(pr.nn$net.result >= 0.5, 1, 0)

# Mean absolute difference between predicted and true labels. Because the
# predictions are 0/1, this is the share of test rows where the prediction
# and the label disagree (lower is better), not a success rate.
MSE.nn <- sum(abs(test_$Survived - nn.res)) / nrow(test_)

print(paste("MSE for neural network: ", MSE.nn))