Data Visualization

Data Literacy

Always Visualize!

Processing numbers is hard

Code
library(datasauRus)
library(dplyr)
library(ggplot2)
library(gt)
library(stringr)
library(tidyr)

tab_ds_data <- datasaurus_dozen |>
  filter(dataset %in% c('dino', 'star')) |>
  group_by(dataset) |>
  mutate(id = 1:n()) |>
  pivot_wider(
    names_from = dataset, 
    values_from = c(x,y),
    names_vary = 'slowest'
  ) |>
  select(-id) |> 
  head(10) 
tab_ds_data |>  
  gt() |>
  cols_label(
    x_dino = 'x',
    y_dino = 'y',
    x_star = 'x',
    y_star = 'y'
  ) |>
  fmt_number() |>
  tab_spanner(
    label = "Dino",
    columns = ends_with("dino")
  ) |>
  cols_align('center') |> 
  tab_spanner(
    label = "Star",
    columns = ends_with("star")
  ) |> 
  tab_options(
    table.width = pct(85),
    table.font.size = 35) 
Dino Star
x y x y
55.38 97.18 58.21 91.88
51.54 96.03 58.20 92.21
46.15 94.49 58.72 90.31
42.82 91.41 57.28 89.91
40.77 88.33 58.08 92.01
38.72 84.87 57.49 88.09
35.64 79.87 28.09 63.51
33.08 77.56 28.09 63.59
28.97 74.49 28.09 63.12
26.15 71.41 27.58 62.82

Summary statistics are limiting

Code
datasaurus_dozen |>
  filter(dataset %in% c('dino', 'star')) |>
  group_by(dataset) |>
  summarize(
    avg_x = round(mean(x), digits = 2), 
    sd_x = round(sd(x), digits = 2),
    avg_y = round(mean(y), digits = 2),
    sd_y = round(sd(y), digits = 2),
    cor_xy = round(cor(x,y), digits = 2)
    ) |>
  mutate(dataset = str_to_title(dataset)) |> 
  gt() |>
  cols_label(
    avg_x = "Mean of x",
    sd_x = "Std. Dev. of x",
    avg_y = "Mean of y",
    sd_y = "Std. Dev. of y",
    cor_xy = "Correlation") |> 
  tab_options(
    table.width = pct(85),
    table.font.size = 35) 
dataset Mean of x Std. Dev. of x Mean of y Std. Dev. of y Correlation
Dino 54.26 16.77 47.83 26.94 -0.06
Star 54.27 16.77 47.84 26.93 -0.06
Code
datasaurus_dozen |>
  filter(dataset %in% c('dino', 'star')) |>
  mutate(dataset = str_to_title(dataset)) |>
  ggplot(aes(x = x, y = y)) +
  geom_point() +
  facet_wrap(~dataset) +
  theme_minimal() +
  theme(
    panel.grid=element_blank(),
    strip.text=element_text(size=35),
    axis.ticks=element_blank(),
    axis.text=element_blank(),
    axis.title=element_text(size=30)
  )

Exercise

See beyond summary statistics

  • Select the datasets x_shape & bullseye from the data.frame datasaurus_dozen
  • Create a table showing the following statistics for the two datasets:
    • mean of x and y,
    • standard deviation of x and y, and
    • covariance between x and y
  • Create a plot showing the two datasets

Here is some code to get you started:

Code
library(datasauRus)
library(dplyr)
library(ggplot2)
library(gt)
library(stringr)
library(tidyr)
filter(datasaurus_dozen, dataset %in% c('x_shape', 'bullseye')) |>
  str(give.attr = FALSE)
tibble [284 × 3] (S3: tbl_df/tbl/data.frame)
 $ dataset: chr [1:284] "x_shape" "x_shape" "x_shape" "x_shape" ...
 $ x      : num [1:284] 38.3 35.8 32.8 33.7 37.2 ...
 $ y      : num [1:284] 92.5 94.1 88.5 88.6 83.7 ...

Visual Channels

Visualizing different magnitudes

  • We can express the comparison of numbers in multiple ways
  • Some representations can be interpreted (by humans) more accurately
  • According to Franconeri et al. (2021) from most to least accurate:
    • Position
    • Length
    • Area
    • Angle
    • Intensity

Visualizing “sameness”

Caution: these can create spurious interpretations

Code
related_data <- data.frame(
  x = c(rnorm(100, 0, 0.1), rnorm(100, 0.5, 0.1)), 
  y = c(rnorm(100, 0, 0.1), rnorm(100, 0.5, 0.1)))
ggplot(related_data, aes(x = x , y = y)) +
  geom_point(size = 1.2) +

  theme_minimal() +
  theme(
    plot.title=element_text(size=35),
    axis.ticks.x=element_blank(),
    axis.text=element_text(size=30),
    axis.title=element_text(size=30)
  )

Counteract by using another principle

Code
related_data$g <- as.factor(ifelse(rbinom(200, 1, prob = 0.5) == 1, 'F', 'M'))
ggplot(related_data, aes(x = x , y = y, color = g, shape = g)) +
  geom_point(size = 4) +
  guides(colour = guide_legend(override.aes = list(size=7))) +
  theme_minimal() +
  theme(
    plot.title=element_text(size=35),
    axis.ticks.x=element_blank(),
    axis.text=element_text(size=30),
    axis.title=element_text(size=30),
    legend.title = element_blank(),
    legend.position = 'top',
    legend.text = element_text(size = 25)
  )

Position is interpreted most accurately

  • Common y-axis alignment allows for accurate interpretation
  • Categories in the x-axis sorted by y-axis value
  • Position should encode the answer to the questions the audience asks
Code
library(palmerpenguins)
library(forcats)
fct_revfreq <- \(x) fct_rev(fct_infreq(x))
ggplot(penguins, aes(x = fct_revfreq(species))) +
  geom_bar(stat = "count") +
  scale_y_continuous(
    expand = expansion(c(0,0.05)),
    breaks = seq(10,150, by = 20)) + 
  labs(
    title = "Number of observations by species",
    x = "Penguin Species"
    ) +
  theme_bw() +
    theme(
    plot.title=element_text(size=35),
    legend.text=element_text(size=25),
    axis.ticks.x=element_blank(),
    axis.text=element_text(size=30),
    axis.title=element_text(size=30)
  )

Lengths of unaligned segments are harder to compare

  • Are there more female Gentoo or female Adelie penguins?
  • Are there more male or female Gentoo penuins?
Code
ggplot(penguins, aes(x = fct_revfreq(species), color = sex, fill = sex)) +
  geom_bar(stat = "count") +
  scale_y_continuous(
    expand = expansion(c(0,0.05)),
    breaks = seq(10,150, by = 20)) + 
  labs(
    title = "Number of observations by species and sex",
    x = "Penguin Species"
    ) +
  theme_bw() +
    theme(
    plot.title=element_text(size=35),
    axis.ticks.x=element_blank(),
    axis.text=element_text(size=30),
    axis.title=element_text(size=30),
    legend.position='top',
    legend.text=element_text(size=25),
    legend.title=element_blank()
  )

If the total number is not important but sub-group numbers are…

  • Are there more female Gentoo or female Adelie penguins?
  • Are there more male or female Gentoo penuins?
Code
ggplot(penguins, aes(x = fct_revfreq(species), color = sex, fill = sex)) +
  geom_bar(stat = "count", position = 'dodge') +
  scale_y_continuous(
    expand = expansion(c(0,0.05)),
    breaks = seq(10,150, by = 20)) + 
  labs(
    title = "Number of observations by species and sex",
    x = "Penguin Species"
    ) +
  theme_bw() +
    theme(
    plot.title=element_text(size=35),
    axis.ticks.x=element_blank(),
    axis.text=element_text(size=30),
    axis.title=element_text(size=30),
    legend.position='top',
    legend.text=element_text(size=25),
    legend.title=element_blank()
  )

Using pre-attentive attributes

  • Action potentials are means of communication between neurons in the brain
  • The probability of firing depends on the strength of the stimulus
  • The strength of the stimulus is relative to the background
  • All or nothing process
    • Once the threshold is exceeded the neuron fires
  • Takeaway for visualization:
    • Most important information needs to be “highlighted” relativ to all other information

How many “b”s are there?

  • top-down attention: deliberate & slow
Code
set.seed(1)
xy = expand.grid(seq(0,1, length.out = 5), seq(0,1, length.out = 5))
text = sample(c("a", "b", "c", "d", "e"), 25, replace = TRUE)
par(mar = c(0.1,0,0,0))
plot(c(0, 1), c(0, 1), ann = F, bty = 'n',type = 'n', xaxt = 'n', yaxt = 'n')
text(xy[,1], xy[,2], text, cex = 3)

How many “b”s are there?

  • bottom-up attention: unconscious & fast
Code
par(mar = c(0,0,0,0))
plot(c(0, 1), c(0, 1), ann = F, bty = 'n',type = 'n',  xaxt = 'n', yaxt = 'n')
text(xy[,1], xy[,2], text, cex = 3, col = ifelse(text == "b", "black", "gray"))

How many “ш”s are there?

Code
par(mar = c(0,0,0,0))
text_cyr <- sample(c("ш", "ц", "ж", "є", "ґ"), 25, replace = TRUE)
plot(c(0, 1), c(0, 1), ann = F, bty = 'n',type = 'n',  xaxt = 'n', yaxt = 'n')
text(xy[,1], xy[,2], text_cyr, cex = 3, col = ifelse(text_cyr == "ш", "black", "gray"))

Areas can be useful to add a third (least important) variable

Code
library(gapminder)
library(dplyr)
library(scales)
gm_last <- gapminder |>
  group_by(country) |>
  slice_max(year) |>
  mutate(hl = as.factor(ifelse(country %in% c("China", "United States"), country, 'other')))
ggplot(gm_last, aes(y = lifeExp, x = gdpPercap, size = pop, color = hl)) +
  geom_point() +
  scale_size(
    name = "Population",
    labels = label_number(scale_cut = cut_short_scale()),
    ) +
  geom_text(
    aes(label = country, x = gdpPercap, y = lifeExp),
    data = filter(gm_last, country %in% c('China', 'United States')),
    hjust = c(-0.1, 0.5),
    vjust = c(4, 2),
    size = 12
    ) +
  geom_segment(
  aes(
    x = gdpPercap + c(200, 0), 
    y = lifeExp - 1, 
    xend = gdpPercap+c(1000, 0), 
    yend = lifeExp-c(5.5, 2)),
  data = filter(gm_last, country %in% c('China', 'United States')),
  size = 0.5
  ) +
  scale_x_continuous(labels = label_comma()) +
  scale_color_manual(values = c('gray20', 'red', 'darkblue')) +
  guides(colour = 'none') +
  labs(y = "Life Expectancy", x = "GDP/Capita") +
  theme_bw() +
    theme(
    plot.title=element_text(size=35),
    axis.ticks.x=element_blank(),
    axis.text=element_text(size=30),
    axis.title=element_text(size=30),
    legend.position='top',
    legend.text=element_text(size=25),
    legend.title=element_text(size=25)
  )

Do not use pie-charts… just don’t

Code
penguins |>
  group_by(species) |>
  summarize(count = n()) |>
ggplot(aes(x = "", y = count, fill = species)) +
  geom_bar(stat = "identity") +
  labs(
    title = "Number of observations by species",
    ) +
  theme_bw() +
    theme(
    plot.title=element_text(size=35),
    axis.ticks=element_blank(),
    axis.text=element_blank(),
    axis.title=element_blank(),
    legend.position='top',
    legend.text=element_text(size=25),
    legend.title=element_blank()
  ) +
  coord_polar("y", start=0) 

Exercise

Perceptual accuracy

  • Create a graph that shows the population of penguins for the different species per island
  • It is of particular interest which species is the most common on each island
  • Create another graph that shows the analysis separately for the two sexes

Visualizing many categories (badly!)

  • What is the 5th smallest state?
  • Where does MO rank in terms of size?
Code
library(datasets)
library(scales)
ggplot(data.frame(abb = state.abb, area = state.area), aes(x = abb, y = area)) +
  geom_bar(stat = 'identity') +
  scale_x_discrete(guide = guide_axis(n.dodge = 2)) +
  scale_y_continuous(
    expand = expansion(c(0,0.05)),
    labels = label_comma(suffix = " sq mi")) + 
  theme_bw() +
    theme(
    plot.title=element_text(size=35),
    axis.ticks.x=element_blank(),
    axis.text=element_text(size=20),
    axis.title=element_text(size=30),
    axis.title.x=element_blank()
  )

Visualizing many categories

  • What is the 5th smallest state?
  • Where does MO rank in terms of size?
Code
library(colorspace)
library(forcats)
state_areas <- data.frame(abb = state.abb, area = state.area, name = state.name) |>
  mutate(abb = fct_reorder(abb, area))
ggplot(state_areas, aes(x = abb, y = area, 
  color = abb == 'MO', fill = abb == 'MO')) +
  geom_bar(stat = 'identity') +
  scale_y_continuous(
    expand = expansion(c(0,0.05)), breaks = seq(50000,600000, by = 100000),
    labels = label_comma(suffix = " sq mi")) + 
  scale_color_manual(values = c('gray85', 'red')) +
  scale_fill_manual(values = c('gray20', 'red')) +
  geom_text(
    aes(label=abb, y = area), 
    position=position_dodge(width=0.9), 
    vjust=-0.50, 
    color = ifelse(state_areas$abb=='MO', 'red', 'gray20'),
    size = 8)+
  labs(title = "US state areas") +
  theme_bw() +
    theme(
    plot.title=element_text(size=45),
    axis.ticks.x=element_blank(),
    axis.text=element_text(size=35),
    axis.text.x=element_blank(),
    axis.title=element_blank(),
    legend.position='none'
  )

Color intensity provides less accurate encoding

Code
library(maps)
library(sf)
library(colorspace)
us_map_data <- map("state", fill=TRUE, plot =FALSE)

usa <- st_as_sf(map("state", fill=TRUE, plot =FALSE))
usa <- merge(
  usa, 
  mutate(state_areas, ID = str_to_lower(name)),
  )
ggplot(usa) +
  geom_sf(aes(fill = area), color = "#2b2b2b", size=0.125) +
  coord_sf(crs = st_crs(6350)) +
  geom_sf_text(aes(label = abb, color = abb == 'MO')) +
  scale_color_manual(values = c('gray20', 'red')) +
  guides(color = 'none') + 
  ggthemes::theme_map() +
  scale_fill_binned_sequential(
    palette = "Heat", 
    labels = label_comma(suffix = " sq mi") 
    )

Colors are useful to mark targets for preattentive processing

  • Large scale spatial data needs to be “projected” to 2D
  • Most well known projection is 4326 measured in degrees latitude and longitude
  • However, there are more accurate projections for specific regions e.g, 3035 for EU+candidates
Code
## Example using "shapefile"
### e.g. for EU download here https://ec.europa.eu/eurostat/web/gisco/geodata/reference-data/administrative-units-statistical-units/nuts
eu <- st_read("data/NUTS_RG_60M_2021_3035.shp/", quiet = TRUE)
eu0 <- filter(eu, LEVL_CODE == 0)
## Transform to longitude and latitude
eu0 <- eu0 |> st_transform(4326)
eu0_box <- eu0 |> st_bbox()
eu0_box <- eu0_box + c(50, 20, 0, -10)
ggplot(st_crop(eu0, eu0_box))+ 
  geom_sf(fill = 'white', lwd = 1) +
  geom_sf(data = filter(eu0, CNTR_CODE == 'AT'), color = 'red', lwd =1.5) + 
  coord_sf(crs = st_crs(3035)) +
  geom_sf_text(aes(label = CNTR_CODE, color = CNTR_CODE == 'AT'), size = 15) +
  scale_color_manual(values = c('gray20', 'red')) +
  guides(color = 'none') + 
  ggthemes::theme_map(base_size = 12) 

Exercise

Geospacial data

  • Obtain the shape (“SHP”) file for the EU + candidates from Eurostat
  • Crop the data to exclude overseas territories
  • Highlight the “DACH” region

“Connectedness” helps connecting the dots

Code
charts <- arrow::read_parquet(
    "data/chart_data/spotify_charts.parquet")  |> 
    filter(country %in% c("de", "fr")) |>
    group_by(country, date) |>
    summarize(total_streams = sum(streams)) 
charts |>
    ggplot(aes(x = date, y = log(total_streams))) +
    geom_point(aes(color = country), size = 4) +
    guides(colour = guide_legend(override.aes = list(size=7))) +
    scale_color_discrete_qualitative(palette = "Dark 2") +
    theme_classic() +
    labs(y = "log(total streams)", title = "Total streams 2023/24", subtitle = "Top 200 songs") +
    theme(
        legend.text = element_text(size = 35),
        legend.title = element_blank(),
        axis.text=element_text(size=35),
        axis.title=element_text(size=35),
        axis.title.x=element_blank(),
        legend.position = 'top',
        plot.title = element_text(size = 35),
        plot.subtitle = element_text(size = 35)
    )

“Connectedness” helps connecting the dots

Code
charts <- arrow::read_parquet(
    "data/chart_data/spotify_charts.parquet")  |> 
    filter(country %in% c("de", "fr")) |>
    group_by(country, date) |>
    summarize(total_streams = sum(streams)) 
charts |>
    ggplot(aes(x = date, y = log(total_streams), color = country)) +
    geom_point(size = 4) +
    geom_line() +
    guides(colour = guide_legend(override.aes = list(size=7))) +
    scale_color_discrete_qualitative(palette = "Dark 2") +
    theme_classic() +
    labs(y = "log(total streams)", title = "Total streams 2023/24", subtitle = "Top 200 songs") +
    theme(
        legend.text = element_text(size = 35),
        legend.title = element_blank(),
        axis.text=element_text(size=35),
        axis.title=element_text(size=35),
        axis.title.x=element_blank(),
        legend.position = 'top',
        plot.title = element_text(size = 35),
        plot.subtitle = element_text(size = 35)
    )

Use scatter plots to show correlation

Code
penguins |>
ggplot(aes(x = bill_length_mm, y = bill_depth_mm, #color = species
)) +
    geom_point(size = 4) +
    geom_smooth(method = 'lm', se = FALSE) +
    guides(colour = guide_legend(override.aes = list(size=7))) +
    theme_classic() +
    labs(y = "Bill depth", x = "Bill length") +
    theme(
        axis.text=element_text(size=35),
        axis.title=element_text(size=35),
    )

Beware of “Simpson’s Paradox”

  • When between-group correlation is the opposite of within-group correlation
Code
penguins |>
ggplot(aes(x = bill_length_mm, y = bill_depth_mm, color = species)) +
    geom_point(size = 4) +
    geom_smooth(method = 'lm', se = FALSE) +
    guides(colour = guide_legend(override.aes = list(size=7))) +
    theme_classic() +
    labs(y = "Bill depth", x = "Bill length") +
    theme(
        legend.text = element_text(size = 35),
        legend.title = element_blank(),
        axis.text=element_text(size=35),
        axis.title=element_text(size=35),
        legend.position = 'top',
    )

Use Boxplots to show distributions

Code
penguins |>
ggplot(aes(
  y = bill_length_mm, 
  x = fct_reorder(species, bill_length_mm, .fun = median, .na_rm = TRUE))) +
    geom_boxplot(na.rm = TRUE) +
    theme_classic() +
    labs(y = "Bill length") +
    theme(
        legend.text = element_text(size = 35),
        legend.title = element_blank(),
        axis.text=element_text(size=35),
        axis.title=element_text(size=35),
        axis.title.x=element_blank(),
        legend.position = 'top',
    )

A few notes on colors

  • The library(colorspace) has convenient functions for picking correct color patterns
  • Palette type depends on the scaling of the data visualized
    • Is the data discrete or continuous?
    • Is there an ordering?
      • Is the ordering diverging (i.e., there is a neutral state)?
  • There are convenience functions for ggplot

scale_()

  • aesthetics: fill, color
  • datatypes: discrete, continuous, binned
  • colorscales: qualitative, sequential, diverging

Diverging colors: correlation matrix

  • Diverging from -1 to 1 with 0 as a neutral point
corrs <- palmerpenguins::penguins |> 
  drop_na() |> 
  select(bill_length_mm, bill_depth_mm, flipper_length_mm) |>
  cor()
cor_df <- data.frame(cor = c(corrs), var1 = factor(col(corrs)), var2 = factor(row(corrs)))
ggplot(cor_df, aes(var1, var2, fill = cor)) + 
  geom_tile() + 
  coord_fixed() +
  ylab("variable") +
  scale_x_discrete(position = "top", name = "variable") +
  scale_fill_continuous_diverging("Blue-Red 3")

Qualitative colors: Unordered groups

  • Rule of thumb: visualize up to 7 groups with colors
  • More become hard to differentiate
ggplot(penguins, 
  aes(x = fct_revfreq(species), 
      color = sex, fill = sex)) +
  geom_bar(stat = "count", position = 'dodge') +
  scale_fill_discrete_qualitative("pastel 1") +
  scale_y_continuous(
    expand = expansion(c(0,0.05)),
    breaks = seq(10,150, by = 20)) + 
  labs(
    title = "Number of observations by species and sex",
    x = "Penguin Species"
    ) +
  theme_bw() +
    theme(
    plot.title=element_text(size=35),
    axis.ticks.x=element_blank(),
    axis.text=element_text(size=30),
    axis.title=element_text(size=30),
    legend.position='top',
    legend.text=element_text(size=25),
    legend.title=element_blank()
  )

Color Vision Deficiency

  • ~8% of men and ~0.5% of women have color vision deficiency (CVD) (“color blindness”)
  • We can simulate CVD using swatchplot(<palette>, cvd = TRUE)
swatchplot(diverging_hcl(7, 'Red-Green'), cvd = TRUE)

References

Franconeri, Steven L., Lace M. Padilla, Priti Shah, Jeffrey M. Zacks, and Jessica Hullman. 2021. “The Science of Visual Data Communication: What Works.” Psychological Science in the Public Interest 22 (3): 110–61. https://doi.org/10.1177/15291006211051956.