Libraries

library(tidyverse)
library(VSURF)
library(party)
library(caret)

Load in the data

titanic <- read_csv("data/train.csv")

Data manipulation

Titles

During my exploratory analysis it looked like there might be some predictive information encoded in the Name column, so I took a little bit of time to extract the different “titles” associated with individual names.

Note: This is super quick and dirty. Please don’t judge me.

#Initial split in the names.
titanic$title <- str_split(titanic$Name, ", ", simplify = TRUE)[, 2] %>%
  str_split(., " ", simplify = TRUE) %>%
  .[, 1]
# There are many "one-off" categories. Reduce them based on groupings:
#   Redundant categories.
titanic$title <- if_else(titanic$title %in% c("Ms.", "Mlle."), "Miss.", titanic$title)
titanic$title <- if_else(titanic$title %in% c("Mme."), "Miss.", titanic$title)
#   Military categories.
titanic$title <- if_else(titanic$title %in% c("Capt.", "Col.", "Major."),
  "Military", titanic$title)
#   Nobility/Other categories. Lump these together to be general, since I won't
#     know the categories in the test dataset on Kaggle.
titanic$title <- if_else(!(titanic$title %in% c("Dr.", "Master.", "Military",
  "Miss.", "Mr.", "Mrs.", "Rev.")), "Nobility", titanic$title)
#Check that the categories make sense.
titanic %>%
  count(title)
## # A tibble: 8 x 2
##   title        n
##   <chr>    <int>
## 1 Dr.          7
## 2 Master.     40
## 3 Military     5
## 4 Miss.      186
## 5 Mr.        517
## 6 Mrs.       125
## 7 Nobility     5
## 8 Rev.         6

Price per cabin

I think it makes more sense to know the price per cabin, rather than the total fare paid.

#Determine the price per cabin, where available.
# Extract the number of cabins purchased.
titanic$numCabin <- str_split(titanic$Cabin, pattern = " ") %>%
  map(function(x) sum(!is.na(x))) %>%
  unlist() %>%
  ifelse(is.na(titanic$Cabin), NA, .)
titanic$pricePerCabin <- titanic$Fare / titanic$numCabin

Last bit of tidying before applying the random forests.

#Factor conversion
#Convert data to factors, where appropriate.
titanic <- titanic %>%
  mutate_at(vars(Sex, Embarked, title), factor)
#Extract only the columns with relevant information.
titanicCore <- titanic %>% dplyr::select(-PassengerId, -Name, -Ticket, -Cabin,
  -numCabin)
#Convert the response to a factor to ensure the random forest algorithm is applying the classification (rather than regression) form of the problem.
titanicCore$Survived <- factor(titanicCore$Survived,
  labels = c("Perished", "Survived"))

Random forest!

Split the data

Before I split the data, I added a random variable to the mix as a sanity check, to make sure the algorithms being tested were actually extracting important information.

#Add a column with random data in it as a sanity check.
set.seed(201805021)
titanicCore$berries <- sample(c("blueberries", "strawberries", "snozberries"),
  size = nrow(titanicCore), replace = TRUE)
titanicCore$berries <- factor(titanicCore$berries)

Generate training and testing data sets.

#Generate training and test datasets.
set.seed(201805022)
#Use an index to extract 80% of the data for the training set.
trainIndex <- sample(1:nrow(titanicCore), round(nrow(titanicCore) * 0.8))
train <- titanicCore[trainIndex, ]
#The remaining 20%.
test <- titanicCore[-trainIndex, ]

“Total” data model using cforest()

I’ve been using the party package for my random forest needs. It seems a bit more flexible than randomForest, but also appears to be a slightly older (but more stable?) version of partykit.

#First pass at a making the random forest using all the data.
trainRF <- cforest(Survived ~ ., data = train,
  controls = cforest_unbiased(ntree = 1000))
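As an aside on the partykit comment above, the (roughly) equivalent fit with the newer implementation would look something like the sketch below, assuming partykit is installed. I stuck with party for the rest of this post; the partykit:: prefix also avoids masking party::cforest().

#Sketch of the equivalent `partykit` fit (not used anywhere below).
trainRFpk <- partykit::cforest(Survived ~ ., data = train, ntree = 1000)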

Confusion matrix

Turns out that the caret package has a nice little function, confusionMatrix(), for classification analysis, which allows one to compare observed and predicted classes. Below is an example of the output. I’m going to save those outputs for a larger comparison later.

#Create a list to store the results of the confusion matrix.
CMnames <- c("Training", "Testing")
CM <- vector("list", length(CMnames))
names(CM) <- CMnames
#Generate a data frame with the observed and predicted data.
trainTable <- data.frame(obs = train$Survived, pred = predict(trainRF))
#Confusion matrix for training set.
CM$Training[["All variables"]] <- confusionMatrix(trainTable$obs, trainTable$pred, positive = "Survived")
#Data frame containing the test set for cross validation.
testTable <- data.frame(obs = test$Survived,
  pred = predict(trainRF, newdata = test))
#Confusion matrix for the cross-validation set.
cMAll <- confusionMatrix(testTable$obs, testTable$pred, positive = "Survived")
cMAll
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Perished Survived
##   Perished      106       10
##   Survived       20       42
##                                           
##                Accuracy : 0.8315          
##                  95% CI : (0.7682, 0.8833)
##     No Information Rate : 0.7079          
##     P-Value [Acc > NIR] : 0.0001002       
##                                           
##                   Kappa : 0.6143          
##  Mcnemar's Test P-Value : 0.1003482       
##                                           
##             Sensitivity : 0.8077          
##             Specificity : 0.8413          
##          Pos Pred Value : 0.6774          
##          Neg Pred Value : 0.9138          
##              Prevalence : 0.2921          
##          Detection Rate : 0.2360          
##    Detection Prevalence : 0.3483          
##       Balanced Accuracy : 0.8245          
##                                           
##        'Positive' Class : Survived        
## 
#Save for later.
CM$Testing[["All variables"]] <- confusionMatrix(testTable$obs,
  testTable$pred, positive = "Survived")

Generation of parsimonious models using party and VSURF

party

Below is the implementation of party’s variable importance method.

#Checking for parsimony.
#Which variables are most important, according to `party`?
impVarP <- varimp(trainRF)
# Convert to data frame.
impVarPDF <- as.data.frame(impVarP) %>%
  rownames_to_column("Parameter") %>%
  arrange(impVarP)
# Change the ordering using factors based on the relative importance value.
impVarPDF$Parameter <- factor(impVarPDF$Parameter,
  levels = impVarPDF$Parameter[order(impVarPDF$impVarP, decreasing = TRUE)])
#Extract the most important variables, relative to an "importance threshold" of
# 0.05. This value is still a bit nebulous to me (see the alternative cutoff
# sketched after this chunk).
impNamesP <- impVarPDF %>%
  dplyr::filter(impVarP >= 0.05) %>%
  dplyr::select(Parameter) %>%
  mutate(Parameter = as.character(Parameter)) %>%
  .$Parameter
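On that nebulous 0.05 cutoff: one rule of thumb sometimes suggested for permutation importance is to use the absolute value of the most negative score as the threshold, since scores below zero can only arise from chance variation. A quick sketch of that alternative (not used below), assuming at least one score is negative:

#Alternative "importance threshold": the magnitude of the most negative score.
altThreshold <- abs(min(impVarP))
impVarPDF %>%
  dplyr::filter(impVarP >= altThreshold)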

What do the data look like?

#Plot the data
impVarPDF %>%
  ggplot(aes(Parameter, impVarP)) +
  geom_point(size = 4) +
  geom_hline(yintercept = 0.05, color = "red") +
  theme_bw() +
  labs(x = NULL,
    y = "Relative importance") +
  ggtitle("Variables of importance from 'party'")

So, it looks like the “title” variable I created from the Name column is providing some predictive utility, which is nice. “Sex” and “Pclass” also meet the threshold. It also appears that the random variable, “berries”, is a really poor predictor of survival, which is good, since it shouldn’t have any impact on the outcome.

VSURF

The output from VSURF is a bit more complicated, so I had to create a little function to extract the data I was most interested in.

#Which variables are most important according to `VSURF`?
impVarV <- VSURF(Survived ~ ., data = train, nfor.thres = 1000,
  nfor.interp = 1000, nfor.pred = 1000, parallel = TRUE, ncores = 6,
  na.action = na.omit)

#A function for extracting the desired information for a VSURF object.
impExtraction <- function(vObj) {
  #Extract all the names from the input data.
  allNames <- names(attr(vObj$terms,"dataClasses"))
  #Remove the first value, which is the response.
  allNames <- allNames[2:length(allNames)]
  #Index the "interpretation" variables.
  interpNames <- allNames[vObj$varselect.interp]
  #Index the "prediction" variables.
  predNames <- allNames[vObj$varselect.pred]
  #Extract the mean importance values. `imp.mean.dec` is already sorted in
  # decreasing order, and `imp.mean.dec.ind` gives the positions of those
  # values in the original variable ordering.
  vImp <- vObj$imp.mean.dec
  #Create the base data frame for the "threshold" variables, reordering the
  # names so they line up with the sorted importance values.
  allDF <- data.frame(Parameter = allNames[vObj$imp.mean.dec.ind], VImean = vImp)
  #Make a "Step" column for all variables. The default value is from the
  # "Threshold" part of the output.
  allDF$Step <- "Threshold"
  allDF <- allDF %>%
    #If Step contains a name from the interp step, then interp overwrites thres.
    mutate(Step = ifelse(Parameter %in% interpNames, "Interpretation", Step),
      #If Step contains a name from pred, then pred overwrites interp.
      Step = ifelse(Parameter %in% predNames, "Prediction", Step),
      #Reorder the Parameters based on the relative importance value.
      Parameter = factor(Parameter, levels = Parameter[order(VImean,
      decreasing = TRUE)]),
      Step = factor(Step, levels = c("Threshold", "Interpretation",
        "Prediction")))
  allDF
}

#Extract the data.
impVarVDF <- impExtraction(impVarV)

Now let’s plot the data.

#Plot the data
impVarVDF %>%
  ggplot(aes(Parameter, VImean, color = Step)) +
  geom_point(size = 4) +
  labs(x = NULL,
    y = "Relative importance") +
  ggtitle("Variables of importance from 'VSURF'") +
  theme_bw() +
  scale_color_brewer(palette = "Set1")

VSURF uses a three-step process for identifying important variables. The first step, “Threshold”, looks at all the variables. “Interpretation” points out the variables that generate the highest-accuracy models. “Prediction” extracts the variables that are most consistently important during the cross-validation phase.
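For reference, the selected variable names can also be pulled straight from the fitted VSURF object; a quick sketch, reusing the same name-extraction trick as in impExtraction() above:

#Cross-check the three VSURF steps directly from the fitted object.
allPredictors <- names(attr(impVarV$terms, "dataClasses"))[-1]
allPredictors[impVarV$varselect.thres]   #Variables kept after the Threshold step.
allPredictors[impVarV$varselect.interp]  #The "Interpretation" set.
allPredictors[impVarV$varselect.pred]    #The "Prediction" set.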

In this case, “title” and “Sex” both make it to the Interpretation step, while only “title” makes it to the Prediction step. “berries” is at the bottom again, so that’s good.

Below, I fit the parsimonious models (the party selection, plus the VSURF Interpretation and Prediction selections) and extract their confusion matrices for the training and testing data sets.

#Apply the parsimonious model from `party`.
trainP <- train[, c("Survived", impNamesP)]
trainPRF <- cforest(Survived ~ ., data = trainP,
  controls = cforest_unbiased(ntree = 1000))
trainTableP <- data.frame(obs = trainP$Survived, pred = predict(trainPRF))
CM$Training[["`party` variables"]] <- confusionMatrix(trainTableP$obs,
  trainTableP$pred, positive = "Survived")
#Data frame containing the test set for cross validation.
testTableP <- data.frame(obs = test$Survived,
  pred = predict(trainPRF, newdata = test))
#Confusion matrix for the cross-validation set.
CM$Testing[["`party` variables"]] <- confusionMatrix(testTableP$obs,
  testTableP$pred, positive = "Survived")

#Apply the parsimonious model from `VSURF`.
#Column names associated with the Interpretation and Prediction steps.
impNamesVInt <- impVarVDF %>%
  dplyr::filter((Step == "Interpretation" | Step == "Prediction")) %>%
  mutate(Parameter = as.character(Parameter)) %>%
  .$Parameter
#Column names of just the Prediction step.
impNamesVPre <- impVarVDF %>%
  dplyr::filter(Step == "Prediction") %>%
  mutate(Parameter = as.character(Parameter)) %>%
  .$Parameter

trainVI <- train[, c("Survived", impNamesVInt)]
trainVRFI <- cforest(Survived ~ ., data = trainVI,
  controls = cforest_unbiased(ntree = 1000))
trainTableVI <- data.frame(obs = trainVI$Survived, pred = predict(trainVRFI))
CM$Training[["`VSURF` Interp variables"]] <- confusionMatrix(trainTableVI$obs,
  trainTableVI$pred, positive = "Survived")
#Data frame containing the test set for cross validation.
testTableVI <- data.frame(obs = test$Survived,
  pred = predict(trainVRFI, newdata = test))
#Confusion matrix for the cross-validation set.
CM$Testing[["`VSURF` Interp variables"]] <- confusionMatrix(testTableVI$obs,
  testTableVI$pred, positive = "Survived")

trainVP <- train[, c("Survived", impNamesVPre)]
trainVRFP <- cforest(Survived ~ ., data = trainVP,
  controls = cforest_unbiased(ntree = 1000))
trainTableVP <- data.frame(obs = trainVP$Survived, pred = predict(trainVRFP))
CM$Training[["`VSURF` Pred variables"]] <- confusionMatrix(trainTableVP$obs,
  trainTableVP$pred, positive = "Survived")
#Data frame containing the test set for cross validation.
testTableVP <- data.frame(obs = test$Survived,
  pred = predict(trainVRFP, newdata = test))
#Confusion matrix for the cross-validation set.
CM$Testing[["`VSURF` Pred variables"]] <- confusionMatrix(testTableVP$obs,
  testTableVP$pred, positive = "Survived")

Model comparison

I think it’s necessary to compare all the models that were just generated to understand how parsimony might have influenced our overall accuracy.

The CMExtraction function is a convenience function for extracting the information in the CM object I created. It works pretty well with the map family of functions, a replacement for the apply family of functions with a supposedly more consistent argument structure and output class.
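A toy illustration of that consistency, unrelated to the Titanic data: map() always returns a list, while the typed variants such as map_df() commit to a single output class.

#`map` returns a list of data frames; `map_df` row-binds them into one.
purrr::map(1:3, ~ data.frame(x = .x, y = .x^2))
purrr::map_df(1:3, ~ data.frame(x = .x, y = .x^2))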

#This function extracts some key summary data for all the confusion matrices in a list.
CMExtraction <- function(listObj) {
  #A list to house the outputs of the for loop.
  outList <- list()
  for(i in 1:length(listObj)) {
    #The name of the object at the list level.
    scenName <- names(listObj[i])
    #Extract the underlying list.
    outObj <- listObj[[i]]
    #Extract the mean accuracy level.
    Acc <- outObj$overall["Accuracy"]
    #The upper 95% CI.
    AccUp <- outObj$overall["AccuracyUpper"]
    #The lower 95% CI.
    AccLo <- outObj$overall["AccuracyLower"]
    #A data frame to house all the data.
    # Note: used `data_frame` because I'm lazy and didn't want to specify string
    #   rules.
    df <- data_frame(name = scenName,
      accuracy = Acc,
      accuracyUp = AccUp,
      accuracyLo = AccLo)
    #Get rid of any row names.
    rownames(df) <- NULL
    #Put the data frame into the output list.
    outList[[i]] <- df
  }
  #Row bind all the data frames.
  out <- purrr::map_df(outList, bind_rows)
  #Return the output.
  out
}
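For what it’s worth, the same extraction could be written a bit more compactly with the map family; the following is a sketch (assuming the CM structure above), rather than what I actually ran.

#A purrr-style version of CMExtraction(). `imap_dfr` passes each element along
# with its name and row-binds the resulting data frames.
CMExtraction2 <- function(listObj) {
  purrr::imap_dfr(listObj, function(outObj, scenName) {
    tibble::tibble(name = scenName,
      accuracy = outObj$overall["Accuracy"],
      accuracyUp = outObj$overall["AccuracyUpper"],
      accuracyLo = outObj$overall["AccuracyLower"])
  })
}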

CMSummary <- purrr::map_df(CM, CMExtraction, .id = "id")

The final plot.

cP <- pals::kovesi.rainbow(12)[c(2,10)]
names(cP) <- NULL
CMSummary %>%
  mutate(id = factor(id, levels = (c("Training", "Testing")))) %>%
  ggplot(aes(name, accuracy, color = id)) +
  geom_errorbar(aes(ymin = accuracyLo, ymax = accuracyUp),
    position = position_dodge(width = 0.9), width = 0.3, show.legend = FALSE) +
  geom_point(size = 4, position = position_dodge(width = 0.9)) +
  labs(x = NULL,
    y = "Accuracy") +
  ggtitle("Random forest model comparisons: Training and testing") +
  theme_bw() +
  theme(legend.title = element_blank(),
    axis.text.x = element_text(angle = 10, hjust = 1)) +
  scale_y_continuous(limits = c(0.6, 1), expand = c(0, 0)) +
  scale_color_manual(values = cP)

Not surprisingly, it appears that including as much data as possible generates the most predictive results. There is, however, a striking amount of overlap between the most and least parsimonious models (“`VSURF` Pred variables” and “All variables”, respectively), implying that someone’s title, and therefore social status, had a lot of influence on whether or not they survived the sinking of the Titanic.