R - titanic dataset visualization
Topic
In this part, we work with the Titanic data set. The table has four main columns: Pclass, Sex, Age, and Survived. Try to find insights about the survival rate of the passengers (e.g. does passenger class affect the odds of surviving? Were women and children really prioritized for rescue?). You can use any type of graph available in the “ggplot2” package: box plot, scatter plot, bar plot, etc. Remember to comment on the plots you produce.
Note: In the Survived column, 1 denotes that the passenger survived the disaster and 0 that the passenger did not.
Preparation
# Libraries needed
library(tidyverse)
library(grid)
library(gridExtra)
Load data
# Load data
# Read the raw passenger table from the CSV file in the working directory.
titanic = read_csv("titanic.csv")

# Select needed columns
# Keep only the four columns analysed in this report.
dat = titanic %>% select(Pclass, Sex, Age, Survived)
# Missing values handling
# Flag rows whose Age is missing, then keep only the rows with a known Age.
na_val = is.na(dat$Age)
dat = dat[!na_val,]
# Report the (rows, columns) of the filtered data.
dim(dat)
## [1] 714 4
The new dataset has 4 columns and 714 rows
# Support function
# Author name shown in the caption of every plot.
author = "Nam Hoang"

# Draw a pie chart from a table of counts.
#
# Args:
#   data:      data frame with a count column named `n` (one row per slice).
#   fill_data: vector, one value per row of `data`, mapped to the fill colour
#              of the slices.
#   title:     plot title; NULL is replaced by an empty string.
#   author:    when not NULL, a caption "Author: <author>" is added.
#
# Returns:
#   The ggplot object (a stacked bar in polar coordinates, void theme) with a
#   percentage label on every slice.
draw_pie = function(data, fill_data, title=NULL, author=NULL) {
  # Draw pie chart
  # Each slice's share of the total is computed from `n` and formatted as a
  # percentage label; a single stacked bar is then bent into a circle.
  fig = data %>%
    mutate(perc = `n` / sum(`n`)) %>%
    mutate(labels = scales::percent(perc)) %>%
    ggplot(aes(x="", n, fill=fill_data)) +
    geom_bar(stat="identity", width=1, color = "black") +
    geom_text(aes(label = labels),
              position = position_stack(vjust = 0.5)) +
    coord_polar("y", start = 0)
  if (is.null(title)) {
    title = ""
  }
  # Only add the caption when an author is supplied.
  if (is.null(author)) {
    fig = fig + labs(x=NULL, y=NULL, title=title)
  } else {
    fig = fig + labs(x=NULL, y=NULL, title=title,
                     caption = paste("Author:", author, sep=" "))
  }
  fig = fig + theme_void()
  return (fig)
}
Passenger class
# Passenger counts per class (Pclass as a factor so each class is one slice).
temp = dat %>% mutate(Pclass=as.factor(Pclass)) %>%
  group_by(Pclass) %>% count() %>% ungroup()
draw_pie(temp, temp$Pclass, title="Pclass distribution", author=author)
# Class distribution among the passengers who survived (Survived == 1).
temp = dat %>% filter(Survived==1) %>%
  mutate(Pclass=as.factor(Pclass)) %>%
  group_by(Pclass) %>% count() %>% ungroup()
p1 = draw_pie(temp, temp$Pclass,
              title="Pclass distribution (survived)", author=author)
# Class distribution among the passengers who died (Survived == 0).
temp = dat %>% filter(Survived==0) %>%
  mutate(Pclass=as.factor(Pclass)) %>%
  group_by(Pclass) %>% count() %>% ungroup()
p2 = draw_pie(temp, temp$Pclass,
              title="Pclass distribution (died)", author=author)
# Show both pies side by side.
grid.arrange(p1, p2, ncol=2)
Class 3 makes up the largest group, at roughly 50% of all passengers, while classes 1 and 2 are about evenly split. Class 3 also accounts for the largest share of the deaths (63.7%), while class 1 accounts for the smallest share (15.1%). Note that these figures are shares of all deaths, not per-class death rates.
Sex
# Passenger counts per sex.
temp = dat %>% group_by(Sex) %>% count() %>% ungroup()
draw_pie(temp, temp$Sex, title="Sex distribution", author=author)
# Sex distribution among the passengers who survived (Survived == 1).
temp = dat %>% filter(Survived==1) %>%
  group_by(Sex) %>% count() %>% ungroup()
p3 = draw_pie(temp, temp$Sex,
              title="Sex distribution (survived)", author=author)
# Sex distribution among the passengers who died (Survived == 0).
temp = dat %>% filter(Survived==0) %>%
  group_by(Sex) %>% count() %>% ungroup()
p4 = draw_pie(temp, temp$Sex,
              title="Sex distribution (died)", author=author)
# Show both pies side by side.
grid.arrange(p3, p4, ncol=2)
Among the survivors, female passengers outnumber male passengers by roughly two to one. Most of the male passengers died in the accident.
Age group
# Age group
# Labels for the four age bins; the bins are left-closed (right = FALSE),
# i.e. [0, 15), [15, 25), [25, 65), [65, Inf).
age_gr = c("0-14", "15-24", "25-64", "65+")
# age_gr = c("Children", "Youth", "Adults", "Seniors")
# Passenger counts per age group.
temp = dat %>% mutate(Age_group = cut(
  Age, breaks = c(0, 15, 25, 65, Inf), labels = age_gr, right = FALSE)
  ) %>% group_by(Age_group) %>% count() %>%
  ungroup()
draw_pie(temp, temp$Age_group,
         title="Age group distribution", author=author)
# Age-group distribution among the passengers who survived (Survived == 1).
temp = dat %>% filter(Survived==1) %>%
  mutate(Age_group = cut(
    Age, breaks = c(0, 15, 25, 65, Inf), labels = age_gr, right = FALSE)
  ) %>% group_by(Age_group) %>% count() %>%
  ungroup()
p5 = draw_pie(temp, temp$Age_group,
              title="Age group distribution (survived)", author=author)
# Age-group distribution among the passengers who died (Survived == 0).
temp = dat %>% filter(Survived==0) %>%
  mutate(Age_group = cut(
    Age, breaks = c(0, 15, 25, 65, Inf), labels = age_gr, right = FALSE)
  ) %>% group_by(Age_group) %>% count() %>%
  ungroup()
p6 = draw_pie(temp, temp$Age_group,
              title="Age group distribution (died)", author=author)
# Show both pies side by side.
grid.arrange(p5, p6, ncol=2)
# Histogram of passenger ages in 35 bins, captioned with the author name.
ggplot(data = dat, mapping = aes(x = Age)) +
  geom_histogram(bins = 35, fill = "#f8766d", color = "black") +
  labs(
    caption = paste("Author:", author),
    title = "Age histogram"
  )
Most of the passengers are adults aged 25-64 (59.5%); the 15-24 age group comes second with 28%. The age-group proportions among the survivors and among those who died are relatively similar.
Survived
# Passenger counts per outcome (Survived as a factor: 0 = died, 1 = survived).
temp = dat %>% mutate(Survived=as.factor(Survived)) %>%
  group_by(Survived) %>% count() %>% ungroup()
draw_pie(temp, temp$Survived, title="Survived distribution", author=author)
Overall, only about 41% of the passengers in this data set survived.