Chapter 4 Distribution–histograms and density plots
library(tidyverse)
library(HistData)
library(ggpubr)
4.1 Histograms and bin choice
Seeing the smooth and rough of data bins or binwidth, default number of bins is 30
h10.plot = ggplot(data = diamonds.df, aes(price)) +
geom_histogram(bins = 10)
h30.plot = ggplot(data = diamonds.df, aes(price)) +
geom_histogram(bins = 30)
h80.plot = ggplot(data = diamonds.df, aes(price)) +
geom_histogram(bins = 80)
ggarrange(h10.plot, h30.plot, h80.plot,
nrow=1, ncol = 3, align = "h")
4.2 Density and kernel adjustment
Density as abstraction and model
Adjust is a multiplier on the default kernel bandwidth and so 1 represents the default
a10.plot = ggplot(data = diamonds.df, aes(price)) +
geom_density(adjust = 10)
a1.plot = ggplot(data = diamonds.df, aes(price)) +
geom_density(adjust = 1)
a01.plot = ggplot(data = diamonds.df, aes(price)) +
geom_density(adjust = 0.1)
ggarrange(a10.plot, a1.plot, a01.plot,
nrow=1, ncol = 3, align = "h")
4.3 Histogram percentage rather than count
library(scales)
ggplot(mtcars.df , aes(x = as.factor(cyl))) +
geom_bar(aes(y = (..count..)/sum(..count..))) +
scale_y_continuous(labels = scales::percent)
4.4 Histogram, density overlay, and normal overlay
4.5 Cummulative density
diamonds.df = diamonds
ggplot(diamonds.df, aes(price, colour = cut)) +
stat_ecdf(geom = "step")
4.6 Quantile-quantle plot
Plots quantiles of sample as a function of the quantiles of the theoretical distribution.
ggplot(diamonds.df, aes(sample = price)) +
geom_qq(distribution = qlnorm) +
geom_abline(intercept = mean(diamonds.df$price), slope = sd(diamonds.df$price))
4.7 Cummulative density
ggplot(diamonds.df, aes(price, colour = cut)) +
stat_ecdf(geom = "step")
4.8 Distribution: 2-D distribution and overplotting revisited
ggplot(diamonds.df, aes(log(carat), log(price)))+
geom_point(alpha = .01)+
theme_bw()
ggplot(diamonds.df, aes(log(carat), log(price)))+
geom_point(size = .5)+
geom_density2d(size=1.2)+
theme_bw()
ggplot(diamonds.df, aes(log(carat), log(price)))+
geom_point(size = .5)+
geom_density2d(size=1.2)+
geom_hex(alpha = .6) +
theme_bw()
ggplot(diamonds.df, aes(log(carat), log(price)))+
#geom_point( )+
#geom_point(size = .5)+
#geom_density2d(size=1.2)+
geom_hex(bins = 50) +
theme_bw()
4.9 Small multiple histogram with density and median reference lines
TODO change to diversity data gender across job types
diamonds.df = diamonds
sum.diamonds.df = diamonds.df %>% group_by(cut) %>%
summarise(q85 = quantile(price, 0.85))
ggplot(data = diamonds.df, aes(price)) +
geom_histogram(aes(y = ..density..), bins = 40) +
geom_density(colour = "darkblue") +
geom_vline(data = sum.diamonds.df, aes(xintercept = q85)) +
facet_grid(cut ~ .)
4.10 Ridge plot–An array of density plots
https://cran.r-project.org/web/packages/ggridges/vignettes/gallery.html
library(ggridges)
library(ggplot2movies)
movies %>% filter(year>1912, length<250) %>%
ggplot(aes(x = length, y = year, group = year)) +
geom_density_ridges(scale = 10, size = 0.25, rel_min_height = 0.03, alpha=.75) +
scale_x_continuous(limits=c(0, 250), expand = c(0.01, 0)) +
scale_y_reverse(breaks=c(2000, 1980, 1960, 1940, 1920, 1900), expand = c(0.01, 0)) +
theme_ridges()
## Picking joint bandwidth of 6.89