library(tidyverse)
library(VSURF)
library(party)
library(caret)
titanic <- read_csv("data/train.csv")
During my exploratory analysis it looked like there might be some important information encoded in the Name column, which might be predictive, so I took a little bit of time to extract the different “titles” associated with individual names.
Note: This is super quick and dirty. Please don’t judge me.
#Initial split in the names.
titanic$title <- str_split(titanic$Name, ", ", simplify = TRUE)[, 2] %>%
str_split(., " ", simplify = TRUE) %>%
.[, 1]
# There are many "one-off" categories. Reduce them based on groupings:
# Redundant categories.
titanic$title <- if_else(titanic$title %in% c("Ms.", "Mlle."), "Miss.", titanic$title)
titanic$title <- if_else(titanic$title %in% c("Mme."), "Miss.", titanic$title)
# Military categories.
titanic$title <- if_else(titanic$title %in% c("Capt.", "Col.", "Major."),
"Military", titanic$title)
# Nobility/Other categories. Lump these together to be general, since I won't
# know the categories in the test dataset on Kaggle.
titanic$title <- if_else(!(titanic$title %in% c("Dr.", "Master.", "Military",
"Miss.", "Mr.", "Mrs.", "Rev.")), "Nobility", titanic$title)
#Check that the categories make sense.
titanic %>%
count(title)
## # A tibble: 8 x 2
## title n
## <chr> <int>
## 1 Dr. 7
## 2 Master. 40
## 3 Military 5
## 4 Miss. 186
## 5 Mr. 517
## 6 Mrs. 125
## 7 Nobility 5
## 8 Rev. 6
I think it makes more sense to know the price/cabin, rather than the entire price paid.
#Determine the price per cabin, where available.
# Exract the number of cabins purchased.
titanic$numCabin <- str_split(titanic$Cabin, pattern = " ") %>%
map(function(x) sum(!is.na(x))) %>%
unlist() %>%
ifelse(is.na(titanic$Cabin), NA, .)
titanic$pricePerCabin <- titanic$Fare / titanic$numCabin
#Factor conversion
#Convert data to factors, where appropriate.
titanic <- titanic %>%
mutate_at(vars(Sex, Embarked, title), factor)
#Extract only the columns with relevant information.
titanicCore <- titanic %>% dplyr::select(-PassengerId, -Name, -Ticket, -Cabin,
-numCabin)
#Convert the response to a factor to ensure the random forest algorithm is applying the classification (rather than regression) form of the problem.
titanicCore$Survived <- factor(titanicCore$Survived,
labels = c("Perished", "Survived"))
Before I split the data, I added a randome variable to the mix as a sanity check, to make sure the algorithms being tested were actually extracting important information.
#Add a column with random data in it as a sanity check.
set.seed(201805021)
titanicCore$berries <- sample(c("blueberries", "strawberries", "snozberries"),
size = nrow(titanicCore), replace = TRUE)
titanicCore$berries <- factor(titanicCore$berries)
Generate training and testing data sets.
#Generate training and test datasets.
set.seed(201805022)
#Use an index to extract 80% of the data for the training set.
trainIndex <- sample(1:nrow(titanicCore), round(nrow(titanicCore) * 0.8))
train <- titanicCore[trainIndex, ]
#The remaining 20%.
test <- titanicCore[-trainIndex, ]
cforest()
I’ve been using the party
package for my random forest needs. It seems a bit more flexible that randomForest
, but also appears to be a slightly older (but more stable?) version of partykit
.
#First pass at a making the random forest using all the data.
trainRF <- cforest(Survived ~ ., data = train,
controls = cforest_unbiased(ntree = 1000))
Turns out that the caret
package has a nice little function for classification analysis, which allows one to compare observed and predicted classes. Below is an example of the output. I’m going to save those outputs for a larger comparison later.
#Create a list to store the results of the confusion matrix.
CMnames <- c("Training", "Testing")
CM <- vector("list", length(CMnames))
names(CM) <- CMnames
#Generate a data frame with the observed and predicted data.
trainTable <- data.frame(obs = train$Survived, pred = predict(trainRF))
#Confusion matrix for training set.
CM$Training[["All variables"]] <- confusionMatrix(trainTable$obs, trainTable$pred, positive = "Survived")
#Data frame containing the test set for cross validation.
testTable <- data.frame(obs = test$Survived,
pred = predict(trainRF, newdata = test))
#Confusion matrix for the cross-validation set.
cMAll <- confusionMatrix(testTable$obs, testTable$pred, positive = "Survived")
cMAll
## Confusion Matrix and Statistics
##
## Reference
## Prediction Perished Survived
## Perished 106 10
## Survived 20 42
##
## Accuracy : 0.8315
## 95% CI : (0.7682, 0.8833)
## No Information Rate : 0.7079
## P-Value [Acc > NIR] : 0.0001002
##
## Kappa : 0.6143
## Mcnemar's Test P-Value : 0.1003482
##
## Sensitivity : 0.8077
## Specificity : 0.8413
## Pos Pred Value : 0.6774
## Neg Pred Value : 0.9138
## Prevalence : 0.2921
## Detection Rate : 0.2360
## Detection Prevalence : 0.3483
## Balanced Accuracy : 0.8245
##
## 'Positive' Class : Survived
##
#Save for later.
CM$Testing[["All variables"]] <- confusionMatrix(testTable$obs,
testTable$pred, positive = "Survived")
party
and VSURF
party
Below is the implementation party
’s variable importance method.
#Checking for parsimony.
#Which variables are most important, according to `party`?
impVarP <- varimp(trainRF)
# Convert to data frame.
impVarPDF <- as.data.frame(impVarP) %>%
rownames_to_column("Parameter") %>%
arrange(impVarP)
# Change the ordering using factors based on
impVarPDF$Parameter <- factor(impVarPDF$Parameter,
levels = impVarPDF$Parameter[order(impVarPDF$impVarP, decreasing = TRUE)])
#Extract variables most important, relative to an "importance threshold" of
# 0.05. This value is still a bit nebulous to me.
impNamesP <- impVarPDF %>%
dplyr::filter(impVarP >= 0.05) %>%
dplyr::select(Parameter) %>%
mutate(Parameter = as.character(Parameter)) %>%
.$Parameter
What do the data look like?
#Plot the data
impVarPDF %>%
ggplot(aes(Parameter, impVarP)) +
geom_point(size = 4) +
geom_hline(yintercept = 0.05, color = "red") +
theme_bw() +
labs(x = NULL,
y = "Relative importance") +
ggtitle("Variables of importance from 'party'")
So, it looks like the variable I created, “title”, using the Name column is generating some utility, which is nice. “Sex” and “Pclass” also meet the threshold. It also appears that the random variable, “berries” is a really poor predictor of survival, which is good, since that shouldn’t have any impact on outcome.
VSURF
The output from VSURF
is a bit more complicated, so I had to create a little function to extract the data I was most interested in.
#Which variables are most important according to `VSURF`?
impVarV <- VSURF(Survived ~ ., data = train, nfor.thres = 1000,
nfor.interp = 1000, nfor.pred = 1000, parallel = TRUE, ncores = 6,
na.action = na.omit)
#A function for extracting the desired information for a VSURF object.
impExtraction <- function(vObj) {
#Extract all the names from the input data.
allNames <- names(attr(vObj$terms,"dataClasses"))
#Remove the first value, which is the response.
allNames <- allNames[2:length(allNames)]
#Index the "interpretation" variables.
interpNames <- allNames[vObj$varselect.interp]
#Index the "prediction" variables.
predNames <- allNames[vObj$varselect.pred]
#Extract the importance index.
vImp <- vObj$imp.mean.dec[vObj$imp.mean.dec.ind]
#Create the base data frame for the "threshold" variables.
allDF <- data.frame(Parameter = allNames, VImean = vImp)
#Make a "Step" column for all variables. The default value is from the
# "Threshold" part of the output.
allDF$Step <- "Threshold"
allDF <- allDF %>%
#If Step contains a name from the interp step, then interp overwrites thres.
mutate(Step = ifelse(Parameter %in% interpNames, "Interpretation", Step),
#If Step contains a name from pred, then pred overwrites interp.
Step = ifelse(Parameter %in% predNames, "Prediction", Step),
#Reorder the Parameters basesd on the relative importance value.
Parameter = factor(Parameter, levels = Parameter[order(VImean,
decreasing = TRUE)]),
Step = factor(Step, levels = c("Threshold", "Interpretation",
"Prediction")))
allDF
}
#Extract the data.
impVarVDF <- impExtraction(impVarV)
Now let’s plot the data.
#Plot the data
impVarVDF %>%
ggplot(aes(Parameter, VImean, color = Step)) +
geom_point(size = 4) +
labs(x = NULL,
y = "Relative importance") +
ggtitle("Variables of importance from 'VSURF'") +
theme_bw() +
scale_color_brewer(palette = "Set1")
VSURF
uses a three step process for identifying important variables. The first step, “Threshold” looks at all the variables. “Interpretation” points out the variables that generate the highest accuracy models. “Prediction” extracts the variables that are most consistently important during the cross-validation phase.
In this case, “title” and “Sex” both make it the the Interpretation step, while only “title” makes it to the prediction phase. “berries” is at the bottom again, so that’s good.
Below, I apply both models for the Interpretation and Prediction steps and extract their confusion classes for the training and testing data sets.
#Apply the parsimonious model from `party`.
trainP <- train[, c("Survived", impNamesP)]
trainPRF <- cforest(Survived ~ ., data = trainP,
controls = cforest_unbiased(ntree = 1000))
trainTableP <- data.frame(obs = trainP$Survived, pred = predict(trainPRF))
CM$Training[["`party` variables"]] <- confusionMatrix(trainTableP$obs,
trainTableP$pred, positive = "Survived")
#Data frame containing the test set for cross validation.
testTableP <- data.frame(obs = test$Survived,
pred = predict(trainPRF, newdata = test))
#Confusion matrix for the cross-validation set.
CM$Testing[["`party` variables"]] <- confusionMatrix(testTableP$obs,
testTableP$pred, positive = "Survived")
#Apply the parsimonious model from `VSURF`.
#Column names associated with the Interpretation and Prediction steps.
impNamesVInt <- impVarVDF %>%
dplyr::filter((Step == "Interpretation" | Step == "Prediction")) %>%
mutate(Parameter = as.character(Parameter)) %>%
.$Parameter
#Column names of just the Prediction step.
impNamesVPre <- impVarVDF %>%
dplyr::filter(Step == "Prediction") %>%
mutate(Parameter = as.character(Parameter)) %>%
.$Parameter
trainVI <- train[, c("Survived", impNamesVInt)]
trainVRFI <- cforest(Survived ~ ., data = trainVI,
controls = cforest_unbiased(ntree = 1000))
trainTableVI <- data.frame(obs = trainVI$Survived, pred = predict(trainVRFI))
CM$Training[["`VSURF` Interp variables"]] <- confusionMatrix(trainTableVI$obs,
trainTableVI$pred, positive = "Survived")
#Data frame containing the test set for cross validation.
testTableVI <- data.frame(obs = test$Survived,
pred = predict(trainVRFI, newdata = test))
#Confusion matrix for the cross-validation set.
CM$Testing[["`VSURF` Interp variables"]] <- confusionMatrix(testTableVI$obs,
testTableVI$pred, positive = "Survived")
trainVP <- train[, c("Survived", impNamesVPre)]
trainVRFP <- cforest(Survived ~ ., data = trainVP,
controls = cforest_unbiased(ntree = 1000))
trainTableVP <- data.frame(obs = trainVP$Survived, pred = predict(trainVRFP))
CM$Training[["`VSURF` Pred variables"]] <- confusionMatrix(trainTableVP$obs,
trainTableVP$pred, positive = "Survived")
#Data frame containing the test set for cross validation.
testTableVP <- data.frame(obs = test$Survived,
pred = predict(trainVRFP, newdata = test))
#Confusion matrix for the cross-validation set.
CM$Testing[["`VSURF` Pred variables"]] <- confusionMatrix(testTableVP$obs,
testTableVP$pred, positive = "Survived")
I think it necessary to compare all the models that were just generated to understand how parsimony might have influenced our overall accuracy.
The CMExtraction
function is a convenience function for extracting the information in the CM
object I created. It works pretty well with the map
family of functions - a replacement to the apply
family of functions, but with a supposedly more consistent argument structure and output class.
#This function extracts some key summary data for all the
CMExtraction <- function(listObj) {
#A list to house the outputs of the for loop.
outList <- list()
for(i in 1:length(listObj)) {
#The name of the object at the list level.
scenName <- names(listObj[i])
#Extract the underlying list.
outObj <- listObj[[i]]
#Extract the mean accuracy level.
Acc <- outObj$overall["Accuracy"]
#The upper 95% CI.
AccUp <- outObj$overall["AccuracyUpper"]
#The lower 95% CI.
AccLo <- outObj$overall["AccuracyLower"]
#A data frame to house all the data.
# Note: used `data_frame` because I'm lazy and didn't want to specify string
# rules.
df <- data_frame(name = scenName,
accuracy = Acc,
accuracyUp = AccUp,
accuracyLo = AccLo)
#Get rid of any row names.
rownames(df) <- NULL
#Put the data frame into the output list.
outList[[i]] <- df
}
#Row bind all the data frames.
out <- purrr::map_df(outList, bind_rows)
#Return the output.
out
}
CMSummary <- purrr::map_df(CM, CMExtraction, .id = "id")
The final plot.
cP <- pals::kovesi.rainbow(12)[c(2,10)]
names(cP) <- NULL
CMSummary %>%
mutate(id = factor(id, levels = (c("Training", "Testing")))) %>%
ggplot(aes(name, accuracy, color = id)) +
geom_errorbar(aes(ymin = accuracyLo, ymax = accuracyUp),
position = position_dodge(width = 0.9), width = 0.3, show.legend = FALSE) +
geom_point(size = 4, position = position_dodge(width = 0.9)) +
labs(x = NULL,
y = "Accuracy") +
ggtitle("Random forest model comparisons: Training and testing") +
theme_bw() +
theme(legend.title = element_blank(),
axis.text.x = element_text(angle = 10, hjust = 1)) +
scale_y_continuous(limits = c(0.6, 1), expand = c(0, 0)) +
scale_color_manual(values = cP)
Not surprisingly, it appears that including as much data as possible generates the most predictive results. However, there is a surprising amount of overlap between the most and least parsimonious models, “`VSURF` Pred variables” and “All variables”, respectively, implying someone’s title, and therefore social status, had a lot of influence on whether or not they survived the sinking of the Titanic.