Data Visualization

Data-based Storytelling

Authors

Affiliations

Daniel Winkler

Institute for Retailing & Data Science

Stephan Fally

Department Marketing

Always Visualize!

Processing numbers is hard

Code

library(datasauRus)
library(dplyr)
library(dagitty)
library(ggdag)
library(ggplot2)
library(gt)
library(stringr)
library(tidyr)
library(scales)

tab_ds_data <- datasaurus_dozen |>
  filter(dataset %in% c('dino', 'star')) |>
  group_by(dataset) |>
  mutate(id = 1:n()) |>
  pivot_wider(
    names_from = dataset, 
    values_from = c(x,y),
    names_vary = 'slowest'
  ) |>
  select(-id) |> 
  head(10) 
tab_ds_data |>  
  gt() |>
  cols_label(
    x_dino = 'x',
    y_dino = 'y',
    x_star = 'x',
    y_star = 'y'
  ) |>
  fmt_number() |>
  tab_spanner(
    label = "Dino",
    columns = ends_with("dino")
  ) |>
  cols_align('center') |> 
  tab_spanner(
    label = "Star",
    columns = ends_with("star")
  ) |> 
  tab_options(
    table.width = pct(85),
    table.font.size = 35)

Dino		Star
x	y	x	y
55.38	97.18	58.21	91.88
51.54	96.03	58.20	92.21
46.15	94.49	58.72	90.31
42.82	91.41	57.28	89.91
40.77	88.33	58.08	92.01
38.72	84.87	57.49	88.09
35.64	79.87	28.09	63.51
33.08	77.56	28.09	63.59
28.97	74.49	28.09	63.12
26.15	71.41	27.58	62.82

Summary statistics are limiting

Code

datasaurus_dozen |>
  filter(dataset %in% c('dino', 'star')) |>
  group_by(dataset) |>
  summarize(
    avg_x = round(mean(x), digits = 2), 
    sd_x = round(sd(x), digits = 2),
    avg_y = round(mean(y), digits = 2),
    sd_y = round(sd(y), digits = 2),
    cor_xy = round(cor(x,y), digits = 2)
    ) |>
  mutate(dataset = str_to_title(dataset)) |> 
  gt() |>
  cols_label(
    avg_x = "Mean of x",
    sd_x = "Std. Dev. of x",
    avg_y = "Mean of y",
    sd_y = "Std. Dev. of y",
    cor_xy = "Correlation") |> 
  tab_options(
    table.width = pct(85),
    table.font.size = 35)

dataset	Mean of x	Std. Dev. of x	Mean of y	Std. Dev. of y	Correlation
Dino	54.26	16.77	47.83	26.94	-0.06
Star	54.27	16.77	47.84	26.93	-0.06

Code

datasaurus_dozen |>
  filter(dataset %in% c('dino', 'star')) |>
  mutate(dataset = str_to_title(dataset)) |>
  ggplot(aes(x = x, y = y)) +
  geom_point() +
  facet_wrap(~dataset) +
  theme_minimal() +
  theme(
    panel.grid=element_blank(),
    strip.text=element_text(size=35),
    axis.ticks=element_blank(),
    axis.text=element_blank(),
    axis.title=element_text(size=30)
  )

Exercise

See beyond summary statistics

Select the datasets x_shape & bullseye from the data.frame datasaurus_dozen
Create a table showing the following statistics for the two datasets:
- mean of x and y,
- standard deviation of x and y, and
- covariance between x and y
Create a plot showing the two datasets

Here is some code to get you started:

Code

library(datasauRus)
library(dplyr)
library(ggplot2)
library(gt)
library(stringr)
library(tidyr)
filter(datasaurus_dozen, dataset %in% c('x_shape', 'bullseye')) |>
  str(give.attr = FALSE)

tibble [284 × 3] (S3: tbl_df/tbl/data.frame)
 $ dataset: chr [1:284] "x_shape" "x_shape" "x_shape" "x_shape" ...
 $ x      : num [1:284] 38.3 35.8 32.8 33.7 37.2 ...
 $ y      : num [1:284] 92.5 94.1 88.5 88.6 83.7 ...

Visual Channels

Visualizing different magnitudes

We can express the comparison of numbers in multiple ways
Some representations can be interpreted (by humans) more accurately
According to (franconeriScienceVisualData2021?) from most to least accurate:
- Position
- Length
- Area
- Angle
- Intensity

Visualizing “sameness”

Caution: these can create spurious interpretations

Code

related_data <- data.frame(
  x = c(rnorm(100, 0, 0.1), rnorm(100, 0.5, 0.1)), 
  y = c(rnorm(100, 0, 0.1), rnorm(100, 0.5, 0.1)))
ggplot(related_data, aes(x = x , y = y)) +
  geom_point(size = 1.2) +

  theme_minimal() +
  theme(
    plot.title=element_text(size=35),
    axis.ticks.x=element_blank(),
    axis.text=element_text(size=30),
    axis.title=element_text(size=30)
  )

Counteract by using another principle

Code

related_data$g <- as.factor(ifelse(rbinom(200, 1, prob = 0.5) == 1, 'F', 'M'))
ggplot(related_data, aes(x = x , y = y, color = g, shape = g)) +
  geom_point(size = 4) +
  guides(colour = guide_legend(override.aes = list(size=7))) +
  theme_minimal() +
  theme(
    plot.title=element_text(size=35),
    axis.ticks.x=element_blank(),
    axis.text=element_text(size=30),
    axis.title=element_text(size=30),
    legend.title = element_blank(),
    legend.position = 'top',
    legend.text = element_text(size = 25)
  )

Position is interpreted most accurately

Common y-axis alignment allows for accurate interpretation
Categories in the x-axis sorted by y-axis value
Position should encode the answer to the questions the audience asks

Code

library(palmerpenguins)
library(forcats)
fct_revfreq <- \(x) fct_rev(fct_infreq(x))
ggplot(penguins, aes(x = fct_revfreq(species))) +
  geom_bar(stat = "count") +
  scale_y_continuous(
    expand = expansion(c(0,0.05)),
    breaks = seq(10,150, by = 20)) + 
  labs(
    title = "Number of observations by species",
    x = "Penguin Species"
    ) +
  theme_bw() +
    theme(
    plot.title=element_text(size=35),
    legend.text=element_text(size=25),
    axis.ticks.x=element_blank(),
    axis.text=element_text(size=30),
    axis.title=element_text(size=30)
  )

Lengths of unaligned segments are harder to compare

Are there more female Gentoo or female Adelie penguins?
Are there more male or female Gentoo penuins?

Code

ggplot(penguins, aes(x = fct_revfreq(species), color = sex, fill = sex)) +
  geom_bar(stat = "count") +
  scale_y_continuous(
    expand = expansion(c(0,0.05)),
    breaks = seq(10,150, by = 20)) + 
  labs(
    title = "Number of observations by species and sex",
    x = "Penguin Species"
    ) +
  theme_bw() +
    theme(
    plot.title=element_text(size=35),
    axis.ticks.x=element_blank(),
    axis.text=element_text(size=30),
    axis.title=element_text(size=30),
    legend.position='top',
    legend.text=element_text(size=25),
    legend.title=element_blank()
  )

If the total number is not important but sub-group numbers are…

Are there more female Gentoo or female Adelie penguins?
Are there more male or female Gentoo penuins?

Code

ggplot(penguins, aes(x = fct_revfreq(species), color = sex, fill = sex)) +
  geom_bar(stat = "count", position = 'dodge') +
  scale_y_continuous(
    expand = expansion(c(0,0.05)),
    breaks = seq(10,150, by = 20)) + 
  labs(
    title = "Number of observations by species and sex",
    x = "Penguin Species"
    ) +
  theme_bw() +
    theme(
    plot.title=element_text(size=35),
    axis.ticks.x=element_blank(),
    axis.text=element_text(size=30),
    axis.title=element_text(size=30),
    legend.position='top',
    legend.text=element_text(size=25),
    legend.title=element_blank()
  )

Using pre-attentive attributes

Action potentials are means of communication between neurons in the brain
The probability of firing depends on the strength of the stimulus
The strength of the stimulus is relative to the background
All or nothing process
- Once the threshold is exceeded the neuron fires
Takeaway for visualization:
- Most important information needs to be “highlighted” relativ to all other information

How many “b”s are there?

top-down attention: deliberate & slow

Code

set.seed(1)
xy = expand.grid(seq(0,1, length.out = 5), seq(0,1, length.out = 5))
text = sample(c("a", "b", "c", "d", "e"), 25, replace = TRUE)
par(mar = c(0.1,0,0,0))
plot(c(0, 1), c(0, 1), ann = F, bty = 'n',type = 'n', xaxt = 'n', yaxt = 'n')
text(xy[,1], xy[,2], text, cex = 3)

How many “b”s are there?

bottom-up attention: unconscious & fast

Code

par(mar = c(0,0,0,0))
plot(c(0, 1), c(0, 1), ann = F, bty = 'n',type = 'n',  xaxt = 'n', yaxt = 'n')
text(xy[,1], xy[,2], text, cex = 3, col = ifelse(text == "b", "black", "gray"))

How many “ш”s are there?

Code

par(mar = c(0,0,0,0))
text_cyr <- sample(c("ш", "ц", "ж", "є", "ґ"), 25, replace = TRUE)
plot(c(0, 1), c(0, 1), ann = F, bty = 'n',type = 'n',  xaxt = 'n', yaxt = 'n')
text(xy[,1], xy[,2], text_cyr, cex = 3, col = ifelse(text_cyr == "ш", "black", "gray"))

Areas can be useful to add a third (least important) variable

Code

library(gapminder)
library(dplyr)
library(scales)
gm_last <- gapminder |>
  group_by(country) |>
  slice_max(year) |>
  mutate(hl = as.factor(ifelse(country %in% c("China", "United States"), country, 'other')))
ggplot(gm_last, aes(y = lifeExp, x = gdpPercap, size = pop, color = hl)) +
  geom_point() +
  scale_size(
    name = "Population",
    labels = label_number(scale_cut = cut_short_scale()),
    ) +
  geom_text(
    aes(label = country, x = gdpPercap, y = lifeExp),
    data = filter(gm_last, country %in% c('China', 'United States')),
    hjust = c(-0.1, 0.5),
    vjust = c(4, 2),
    size = 12
    ) +
  geom_segment(
  aes(
    x = gdpPercap + c(200, 0), 
    y = lifeExp - 1, 
    xend = gdpPercap+c(1000, 0), 
    yend = lifeExp-c(5.5, 2)),
  data = filter(gm_last, country %in% c('China', 'United States')),
  size = 0.5
  ) +
  scale_x_continuous(labels = label_comma()) +
  scale_color_manual(values = c('gray20', 'red', 'darkblue')) +
  guides(colour = 'none') +
  labs(y = "Life Expectancy", x = "GDP/Capita") +
  theme_bw() +
    theme(
    plot.title=element_text(size=35),
    axis.ticks.x=element_blank(),
    axis.text=element_text(size=30),
    axis.title=element_text(size=30),
    legend.position='top',
    legend.text=element_text(size=25),
    legend.title=element_text(size=25)
  )

Do not use pie-charts… just don’t

Code

penguins |>
  group_by(species) |>
  summarize(count = n()) |>
ggplot(aes(x = "", y = count, fill = species)) +
  geom_bar(stat = "identity") +
  labs(
    title = "Number of observations by species",
    ) +
  theme_bw() +
    theme(
    plot.title=element_text(size=35),
    axis.ticks=element_blank(),
    axis.text=element_blank(),
    axis.title=element_blank(),
    legend.position='top',
    legend.text=element_text(size=25),
    legend.title=element_blank()
  ) +
  coord_polar("y", start=0)

Exercise

Perceptual accuracy

Create a graph that shows the population of penguins for the different species per island
It is of particular interest which species is the most common on each island
Create another graph that shows the analysis separately for the two sexes

Visualizing many categories (badly!)

What is the 5th smallest state?
Where does MO rank in terms of size?

Code

library(datasets)
library(scales)
ggplot(data.frame(abb = state.abb, area = state.area), aes(x = abb, y = area)) +
  geom_bar(stat = 'identity') +
  scale_x_discrete(guide = guide_axis(n.dodge = 2)) +
  scale_y_continuous(
    expand = expansion(c(0,0.05)),
    labels = label_comma(suffix = " sq mi")) + 
  theme_bw() +
    theme(
    plot.title=element_text(size=35),
    axis.ticks.x=element_blank(),
    axis.text=element_text(size=20),
    axis.title=element_text(size=30),
    axis.title.x=element_blank()
  )

Visualizing many categories

What is the 5th smallest state?
Where does MO rank in terms of size?

Code

library(colorspace)
library(forcats)
state_areas <- data.frame(abb = state.abb, area = state.area, name = state.name) |>
  mutate(abb = fct_reorder(abb, area))
ggplot(state_areas, aes(x = abb, y = area, 
  color = abb == 'MO', fill = abb == 'MO')) +
  geom_bar(stat = 'identity') +
  scale_y_continuous(
    expand = expansion(c(0,0.05)), breaks = seq(50000,600000, by = 100000),
    labels = label_comma(suffix = " sq mi")) + 
  scale_color_manual(values = c('gray85', 'red')) +
  scale_fill_manual(values = c('gray20', 'red')) +
  geom_text(
    aes(label=abb, y = area), 
    position=position_dodge(width=0.9), 
    vjust=-0.50, 
    color = ifelse(state_areas$abb=='MO', 'red', 'gray20'),
    size = 8)+
  labs(title = "US state areas") +
  theme_bw() +
    theme(
    plot.title=element_text(size=45),
    axis.ticks.x=element_blank(),
    axis.text=element_text(size=35),
    axis.text.x=element_blank(),
    axis.title=element_blank(),
    legend.position='none'
  )

Color intensity provides less accurate encoding

Code

library(maps)
library(sf)
library(colorspace)
us_map_data <- map("state", fill=TRUE, plot =FALSE)

usa <- st_as_sf(map("state", fill=TRUE, plot =FALSE))
usa <- merge(
  usa, 
  mutate(state_areas, ID = str_to_lower(name)),
  )
ggplot(usa) +
  geom_sf(aes(fill = area), color = "#2b2b2b", size=0.125) +
  coord_sf(crs = st_crs(6350)) +
  geom_sf_text(aes(label = abb, color = abb == 'MO')) +
  scale_color_manual(values = c('gray20', 'red')) +
  guides(color = 'none') + 
  ggthemes::theme_map() +
  scale_fill_binned_sequential(
    palette = "Heat", 
    labels = label_comma(suffix = " sq mi") 
    )

Colors are useful to mark targets for preattentive processing

Large scale spatial data needs to be “projected” to 2D
Most well known projection is 4326 measured in degrees latitude and longitude
However, there are more accurate projections for specific regions e.g, 3035 for EU+candidates

Code

## Example using "shapefile"
### e.g. for EU download here https://ec.europa.eu/eurostat/web/gisco/geodata/reference-data/administrative-units-statistical-units/nuts
eu <- st_read("data/NUTS_RG_60M_2021_3035.shp/", quiet = TRUE)
eu0 <- filter(eu, LEVL_CODE == 0)
## Transform to longitude and latitude
eu0 <- eu0 |> st_transform(4326)
eu0_box <- eu0 |> st_bbox()
eu0_box <- eu0_box + c(50, 20, 0, -10)
ggplot(st_crop(eu0, eu0_box))+ 
  geom_sf(fill = 'white', lwd = 1) +
  geom_sf(data = filter(eu0, CNTR_CODE == 'AT'), color = 'red', lwd =1.5) + 
  coord_sf(crs = st_crs(3035)) +
  geom_sf_text(aes(label = CNTR_CODE, color = CNTR_CODE == 'AT'), size = 15) +
  scale_color_manual(values = c('gray20', 'red')) +
  guides(color = 'none') + 
  ggthemes::theme_map(base_size = 12)

Exercise

Geospacial data

Obtain the shape (“SHP”) file for the EU + candidates from Eurostat
Crop the data to exclude overseas territories
Highlight the “DACH” region

“Connectedness” helps connecting the dots

Code

charts <- arrow::read_parquet(
    "data/chart_data/spotify_charts.parquet")  |> 
    filter(country %in% c("de", "fr")) |>
    group_by(country, date) |>
    summarize(total_streams = sum(streams)) 
charts |>
    ggplot(aes(x = date, y = log(total_streams))) +
    geom_point(aes(color = country), size = 4) +
    guides(colour = guide_legend(override.aes = list(size=7))) +
    scale_color_discrete_qualitative(palette = "Dark 2") +
    theme_classic() +
    labs(y = "log(total streams)", title = "Total streams 2023/24", subtitle = "Top 200 songs") +
    theme(
        legend.text = element_text(size = 35),
        legend.title = element_blank(),
        axis.text=element_text(size=35),
        axis.title=element_text(size=35),
        axis.title.x=element_blank(),
        legend.position = 'top',
        plot.title = element_text(size = 35),
        plot.subtitle = element_text(size = 35)
    )

“Connectedness” helps connecting the dots

Code

charts <- arrow::read_parquet(
    "data/chart_data/spotify_charts.parquet")  |> 
    filter(country %in% c("de", "fr")) |>
    group_by(country, date) |>
    summarize(total_streams = sum(streams)) 
charts |>
    ggplot(aes(x = date, y = log(total_streams), color = country)) +
    geom_point(size = 4) +
    geom_line() +
    guides(colour = guide_legend(override.aes = list(size=7))) +
    scale_color_discrete_qualitative(palette = "Dark 2") +
    theme_classic() +
    labs(y = "log(total streams)", title = "Total streams 2023/24", subtitle = "Top 200 songs") +
    theme(
        legend.text = element_text(size = 35),
        legend.title = element_blank(),
        axis.text=element_text(size=35),
        axis.title=element_text(size=35),
        axis.title.x=element_blank(),
        legend.position = 'top',
        plot.title = element_text(size = 35),
        plot.subtitle = element_text(size = 35)
    )

Use scatter plots to show correlation

Code

penguins |>
ggplot(aes(x = bill_length_mm, y = bill_depth_mm, #color = species
)) +
    geom_point(size = 4) +
    geom_smooth(method = 'lm', se = FALSE) +
    guides(colour = guide_legend(override.aes = list(size=7))) +
    theme_classic() +
    labs(y = "Bill depth", x = "Bill length") +
    theme(
        axis.text=element_text(size=35),
        axis.title=element_text(size=35),
    )

Beware of “Simpson’s Paradox”

When between-group correlation is the opposite of within-group correlation

Code

penguins |>
ggplot(aes(x = bill_length_mm, y = bill_depth_mm, color = species)) +
    geom_point(size = 4) +
    geom_smooth(method = 'lm', se = FALSE) +
    guides(colour = guide_legend(override.aes = list(size=7))) +
    theme_classic() +
    labs(y = "Bill depth", x = "Bill length") +
    theme(
        legend.text = element_text(size = 35),
        legend.title = element_blank(),
        axis.text=element_text(size=35),
        axis.title=element_text(size=35),
        legend.position = 'top',
    )

Use Boxplots to show distributions

Code

penguins |>
ggplot(aes(
  y = bill_length_mm, 
  x = fct_reorder(species, bill_length_mm, .fun = median, .na_rm = TRUE))) +
    geom_boxplot(na.rm = TRUE) +
    theme_classic() +
    labs(y = "Bill length") +
    theme(
        legend.text = element_text(size = 35),
        legend.title = element_blank(),
        axis.text=element_text(size=35),
        axis.title=element_text(size=35),
        axis.title.x=element_blank(),
        legend.position = 'top',
    )

Visualizing Uncertainty

Recap DAGs \(\rightarrow\) observational data
Experiments
Model choice
Visualization

Analyzing DAGs: Pipe

Bad Control (possibly use mediation analysis)

Code

med <- dagify(z ~ x, y2 ~ z,
       coords = list(x = c(x = 1, z = 1.5, y2 = 2), y = c(x=1, y2 = 1, z=1))
) %>% 
  tidy_dagitty() %>%
  mutate(fill = ifelse(name == "z", "Mediator", "variables of interest")) %>% 
  ggplot(aes(x = x, y = y, xend = xend, yend = yend)) + 
  geom_dag_point(size=7, aes(color = fill)) + 
  geom_dag_edges(show.legend = FALSE)+
  geom_dag_text() +
  theme_dag() +
  theme(legend.title  = element_blank(),
        legend.position = "top") 
med

x causes y through z
Controlling for z blocks the causal association x \(\rightarrow\) y2

Analyzing DAGs: Collider

Bad control

Code

dagify(a ~ x, a ~ y,
  coords = list(x = c(x = 1, y = 2, a = 1.5), y = c(x = 1, y = 0,  a = 0))
) |>
  tidy_dagitty() |>
  mutate(fill = ifelse(name == "a", "Collider", "variables of interest")) |>
  ggplot(aes(x = x, y = y, xend = xend, yend = yend)) +
  geom_dag_point(size = 7, aes(color = fill)) +
  geom_dag_edges(show.legend = FALSE) +
  geom_dag_text() +
  theme_dag() +
  theme(
    legend.title = element_blank(),
    legend.position = "top"
  )

x & y cause a
x & y are d-separated and uncorrelated
By adding a to the model spurious correlation between x & y is introduced

Analyzing DAGs: Fork

Good Control

Code

med <- dagify( x ~ d, y1 ~ d,
       coords = list(x = c(x = 1, z = 1.5, y = 2, a = 1.5, b = 2, d = 1.5, y1 = 2), 
                     y = c(x = 1, y = 1, z = 1, a = 0, b = 0, d = 2, y1 = 2))
) %>% 
  tidy_dagitty() %>%
  mutate(fill = ifelse(name == "d", "Confounder", "variables of interest")) %>% 
  ggplot(aes(x = x, y = y, xend = xend, yend = yend)) + 
  geom_dag_point(size=7, aes(color = fill)) + 
  geom_dag_edges(show.legend = FALSE)+
  geom_dag_text() +
  theme_dag() +
  theme(legend.title  = element_blank(),
        legend.position = "top") 
med

d causes both x and y1
Arrows pointing to x are called “back-door” paths
Controlling for d “blocks” the non-causal association x \(\rightarrow\) y1

. . .

Eliminated by randomized experiment! Why?

Randomized experiments

Researcher manipulates the variable of interest (x)
Holding all other variables (confounders) constant
To check the “pure” causal effect of the manipulation on the relevant outcome (y)
Intuition: variation in y has to be due to x as all other things are kept constant

Code

exper <- dagify( x ~ d, y ~ d, y ~ x,
       coords = list(x = c(x = 1, d = 1.5, y = 2), 
                     y = c(x = 0, d = 1, y = 0))
) |> 
  tidy_dagitty() %>%
  mutate(fill = ifelse(name == "d", "Confounder", "variables of interest")) %>% 
  ggplot(aes(x = x, y = y, xend = xend, yend = yend)) + 
  geom_dag_point(size=7, aes(color = fill)) + 
  geom_dag_edges(
    show.legend = FALSE, 
    aes(
      edge_color = ifelse(name == 'd', 'gray80', 'darkgreen'),
      edge_linetype = ifelse(name == 'd', 'dashed', 'solid')
      ))+
  geom_dag_text() +
  theme_dag() +
  theme(legend.title  = element_blank(),
        legend.position = "top") +
  geom_text(aes(x = 1.5, y = 1, label = '← kept constant'), hjust = -0.3)
exper

Notes on choosing a model

Often models assume a distribution of the error term
- e.g., linear regression: normal distribution
We can choose a model based on its assumptions
- e.g., continuous data: linear regression
- e.g., binary data: logistic regression
Specialized methods for dependent observations
- e.g., paired t-test \(\rightarrow\) test on difference of within-subject design
Specialized methods for violations of assumptions
- e.g., Wilcoxon signed-rank test as alternative to t-test

Visualizing Uncertainty: Simple experiments

Outputs from statistical models should generally include:
- Effect size
- Test statistic calculated (incl. paramters) & p-value
- It is typical to include an interpretation of significance
- However, this is not recommended by statistical community. Wasserstein, Schirm, and Lazar (2019):

“if p = 0.03 then “…was statistically significant” would be replaced by “…had p = 0.03,””

The ggstatsplot library (Patil 2021) provides useful functions to calculate and visualize statistical tests and model parameters
Follows best practices
ggbetweenstats & ggwithinstats for between- and within-subject experimental designs
- Chooses automatically between t-test (2 groups) and ANOVA (3+ groups)
- Large number of tests available
- Choose type='nonparametric' for versions that are robust to violations of the normal-distribution assumption
- Choose type='robust' for versions that are robust to a few outliers

Code

library(ggstatsplot)
library(palmerpenguins)
ggbetweenstats(
  data = penguins,
  x = species,
  y = body_mass_g,
  type = 'nonparametric',
  violin.args = list(width = 0, linewidth=0),
  point.args = list(alpha = 0),
  bf.message = FALSE
) + labs(y = "body mass")

Visualizing uncertainty: Regression models

Use the ggcoefstats function to visualize the results of (currently) \(225\) different types of regression models

Code

ggcoefstats(
  lm(body_mass_g ~ species + sex, data = penguins)
) + labs(caption = 'Error bars show 95% confidence interval')

Visualizing uncertainty: Regression models

Coefficient names can be modified by providing a data.frame with the relevant statistics instead of the model

Code

body_mass_mod <- lm(body_mass_g ~ species + sex, data = penguins)
tidy_coefs <- broom::tidy(body_mass_mod)
tidy_coefs$df.error <- body_mass_mod$df.residual
tidy_coefs <- confint(body_mass_mod) |> 
  data.frame() |> 
  dplyr::rename(conf.low = 'X2.5..', conf.high = 'X97.5..') |>
  cbind(tidy_coefs) |>
  mutate(term = str_replace_all(
    term, c(
      sexmale = "male (vs. female)",
      speciesGentoo = "Gentoo (vs. Adelie)",
      speciesChinstrap = "Chinstrap (vs. Adelie)"
    )))
ggcoefstats(
  tidy_coefs, 
  statistic = 't',
  exclude.intercept = TRUE
  ) + 
  labs(title = 'DV: body mass (g)', caption = 'Intercept omitted; error bars show 95% confidence interval') +
  scale_x_continuous(labels = label_number(suffix = 'g')) +
  theme(axis.title.y = element_blank())

A few notes on colors

The library(colorspace) has convenient functions for picking correct color patterns
Palette type depends on the scaling of the data visualized
- Is the data discrete or continuous?
- Is there an ordering?
  - Is the ordering diverging (i.e., there is a neutral state)?
There are convenience functions for ggplot

scale_<aesthetic>_<datatype>_<colorscale>()

aesthetics: fill, color
datatypes: discrete, continuous, binned
colorscales: qualitative, sequential, diverging

Diverging colors: correlation matrix

Diverging from -1 to 1 with 0 as a neutral point

corrs <- palmerpenguins::penguins |> 
  drop_na() |> 
  select(bill_length_mm, bill_depth_mm, flipper_length_mm) |>
  cor()
cor_df <- data.frame(cor = c(corrs), var1 = factor(col(corrs)), var2 = factor(row(corrs)))
ggplot(cor_df, aes(var1, var2, fill = cor)) + 
  geom_tile() + 
  coord_fixed() +
  ylab("variable") +
  scale_x_discrete(position = "top", name = "variable") +
  scale_fill_continuous_diverging("Blue-Red 3")

Qualitative colors: Unordered groups

Rule of thumb: visualize up to 7 groups with colors
More become hard to differentiate

ggplot(penguins, 
  aes(x = fct_revfreq(species), 
      color = sex, fill = sex)) +
  geom_bar(stat = "count", position = 'dodge') +
  scale_fill_discrete_qualitative("pastel 1") +
  scale_y_continuous(
    expand = expansion(c(0,0.05)),
    breaks = seq(10,150, by = 20)) + 
  labs(
    title = "Number of observations by species and sex",
    x = "Penguin Species"
    ) +
  theme_bw() +
    theme(
    plot.title=element_text(size=35),
    axis.ticks.x=element_blank(),
    axis.text=element_text(size=30),
    axis.title=element_text(size=30),
    legend.position='top',
    legend.text=element_text(size=25),
    legend.title=element_blank()
  )

Color Vision Deficiency

~8% of men and ~0.5% of women have color vision deficiency (CVD) (“color blindness”)
We can simulate CVD using swatchplot(<palette>, cvd = TRUE)

swatchplot(diverging_hcl(7, 'Red-Green'), cvd = TRUE)

References

Patil, Indrajeet. 2021. “Visualizations with statistical details: The ’ggstatsplot’ approach.” Journal of Open Source Software 6 (61): 3167. https://doi.org/10.21105/joss.03167.

Wasserstein, Ronald L., Allen L. Schirm, and Nicole A. Lazar. 2019. “Moving to a World Beyond ‘p < 0.05’.” The American Statistician 73 (March): 1–19. https://doi.org/10.1080/00031305.2019.1583913.