a06_0_Poisson_Negative_Binomial.Rmd
Ben bolker, one of the main creators of the glmmTMB
R package, wrote this: https://ms.mcmaster.ca/~bolker/emdbook/book.pdf
I also think you should own Advanced Statistics with Application in R by Demidenko
Links:
Illustrate how the Negative Binomial distribution
Conduct a simulation study with a unique set of conditions and compute the Type I error and Power. Remember to use the save()
and load()
functions. Use the R code provided below.
library(SPR)
library(MASS)
N = 1e5 #this should be divisible by however many groups you use!
number.groups <- 2
number.timepoints <- 1
set.seed(2012021)
dat <- data.frame(
'USUBJID' = rep(paste0('Subject_', formatC(1:N, width = 4, flag = '0')), length.out= N*number.timepoints),
'Group' = rep(paste0('Group_', 1:number.groups), length.out = N*number.timepoints),
#'Y_nb' = rep(NA, N*number.timepoints),
#'Y_pois' = rep(NA, N*number.timepoints),
#'Bio' = rep(rnorm(N, mean = 0, sd = 1), number.timepoints),
'Time' = rep(paste0('Time_', 1:number.timepoints), each = N),
stringsAsFactors=F)
# Design Matrix
X <- model.matrix( ~ Group , data = dat)
Beta <- matrix(0, nrow = ncol(X), dimnames=list(colnames(X), 'param'))
Beta[] <- c(0.2, 1)
# Parameters:
mu <- exp(X %*% Beta)
dat$mu <- as.vector(mu)
theta <- 1 # dispersion parameter
dat$theta <- as.vector(theta)
upsilon <- mu + (mu^2)/theta
dat$upsilon <- as.vector(upsilon)
prob <- theta/(theta + mu)
dat$Y_nb <- rnbinom(n = N, size = theta, mu = mu)
hist.nb <- hist(dat$Y_nb, plot = F)
hist.nb$counts <- 100*hist.nb$counts/N
plot(hist.nb, ylim = c(0, 100), ylab = 'Percentage', col = 'grey', main = 'Negative Binomial')
dat$Y_pois <- rpois(n = N, lambda = mu)
hist.pois <- hist(dat$Y_pois, plot = F)
hist.pois$counts <- 100*hist.pois$counts/N
plot(hist.pois, ylim = c(0, 100), ylab = 'Percentage', col = 'grey', main = 'Poisson')
# Poisson:
# Mean
aggregate(Y_pois ~ Group, FUN = mean, data = dat)
aggregate(mu ~ Group, FUN = mean, data = dat)
# Variance
aggregate(Y_pois ~ Group, FUN = var, data = dat)
# Negative Binomial:
# Mean:
aggregate(Y_nb ~ Group, FUN = mean, data = dat)
aggregate(mu ~ Group, FUN = mean, data = dat)
# Variance:
aggregate(Y_nb ~ Group, FUN = var, data = dat)
aggregate(upsilon ~ Group, FUN = mean, data = dat)
# Fit Models:
mod.pois <- glm(Y_pois ~ Group, data = dat, family = 'poisson')
summary(mod.pois)
mod.nb <- MASS::glm.nb(Y_nb ~ Group, data = dat)
summary(mod.nb)
# Fit Models using glmmTMB R package:
# Poisson
mp <- glmmTMB(Y_pois ~ Group, family=poisson, data=dat)
summary(mp)
# NB
mnp <- glmmTMB(Y_nb ~ Group, family=nbinom2, data=dat)
summary(mnp)
This R code helps to illustrate the relationship between the two distributions.
library(SPR)
library(MASS)
library(ggplot2)
library(gridExtra)
N = 1e5 #this should be divisible by however many groups you use!
number.groups <- 2
number.timepoints <- 1
set.seed(4202021)
dat <- data.frame(
'USUBJID' = rep(paste0('Subject_', formatC(1:N, width = 4, flag = '0')), length.out= N*number.timepoints),
'Group' = rep(paste0('Group_', 1:number.groups), length.out = N*number.timepoints),
#'Y_nb' = rep(NA, N*number.timepoints),
#'Y_pois' = rep(NA, N*number.timepoints),
#'Bio' = rep(rnorm(N, mean = 0, sd = 1), number.timepoints),
'Time' = rep(paste0('Time_', 1:number.timepoints), each = N),
stringsAsFactors=F)
# Design Matrix
X <- model.matrix( ~ Group , data = dat)
Beta <- matrix(0, nrow = ncol(X), dimnames=list(colnames(X), 'param'))
Beta[] <- c(0.2, 1)
# Parameters:
mu <- exp(X %*% Beta)
dat$mu <- as.vector(mu)
for (theta in c(1000, 100, 10, 1, 0.1)) {
#theta <- 1000# dispersion parameter
dat$theta <- as.vector(theta)
upsilon <- mu + (mu^2)/theta
dat$upsilon <- as.vector(upsilon)
prob <- theta/(theta + mu)
# Generate Data:
dat$Y_nb <- rnbinom(n = N, size = theta, mu = mu)
dat$Y_pois <- rpois(n = N, lambda = mu)
p1 <- ggplot2::ggplot(dat, aes(x = Y_nb)) +
geom_bar(aes(y = 100*(..count..)/sum(..count..))) +
labs(subtitle = paste0('Theta = ', theta), title='Negative Binomial', x ="Value", y = "Percent") +
ylim(c(0, 25)) +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5))
p2 <- ggplot2::ggplot(dat, aes(x = Y_pois)) +
geom_bar(aes(y = 100*(..count..)/sum(..count..))) +
labs(subtitle = paste0('Theta = ', theta), title='Poisson', x ="Value", y = "Percent") +
ylim(c(0, 50)) +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5))
gridExtra::grid.arrange(p1, p2, ncol = 2)
# Poisson:
# Mean
aggregate(Y_pois ~ Group, FUN = mean, data = dat)
aggregate(mu ~ Group, FUN = mean, data = dat)
# Variance
aggregate(Y_pois ~ Group, FUN = var, data = dat)
# Negative Binomial:
# Mean:
aggregate(Y_nb ~ Group, FUN = mean, data = dat)
aggregate(mu ~ Group, FUN = mean, data = dat)
# Variance:
aggregate(Y_nb ~ Group, FUN = var, data = dat)
aggregate(upsilon ~ Group, FUN = mean, data = dat)
}
Custom estimation code helps to understand how the model is estimated.
# Custom written estimation routine:
Neg_Binom_loglike <- function(vP, dat, Y){
# the 'par' (parameters to estimate) are only passed as a vector
# re-create what you need from that vector
Beta <- vP[c('b0', 'b1')]
Beta <- matrix(Beta, ncol = 1)
size <- vP['theta']
X <- model.matrix( ~ Group , data = dat)
Y <- dat[ , Y]
# Parameters:
mu <- exp(X %*% Beta)
prob <- size/(size + mu)
loglike <- lgamma(Y + size) - lgamma(size) - lgamma(Y + 1) + size*log(prob) + (Y)*log(1-prob)
loglike <- -1*loglike
loglike <- sum(loglike)
return(loglike)
}
# optimize
Beta; theta # Gen param
vP <- c('b0' = 0.2, 'b1' = 1, 'theta' = 1)
out <- optim(par = vP, fn = Neg_Binom_loglike, method = 'BFGS',
dat = dat, Y = 'Y_nb')
# Compare Generating Parameter to R package estimate and the custom code estimate
cbind('Gen Param' = c(as.vector(Beta), theta), 'Custom' = out$par, 'R package' = c(coef(mod.nb), mod.nb$theta))
# Custom written estimation routine:
Poisson_loglike <- function(vP, dat, Y){
# the 'par' (parameters to estimate) are only passed as a vector
# re-create what you need from that vector
Beta <- vP[c('b0', 'b1')]
Beta <- matrix(Beta, ncol = 1)
X <- model.matrix( ~ Group , data = dat)
Y <- dat[ , Y]
# Parameters:
mu <- exp(X %*% Beta)
loglike <- Y * log(mu) - mu - lgamma(Y + 1)
loglike <- -1*loglike
loglike <- sum(loglike)
return(loglike)
}
# optimize
vP <- c('b0' = 0, 'b1' = 0.5)
start <- Sys.time()
out <- optim(par = vP, fn = Poisson_loglike, method = 'BFGS',
dat = dat, Y = 'Y_pois')
end <- Sys.time()
end - start
# Compare Generating Parameter to R package estimate and the custom code estimate
cbind('**Gen Param**' = c(as.vector(Beta)), '**Custom**' = out$par, '**R package**' = c(coef(mod.pois)))
Conduct a simulation study with a unique set of conditions and compute the Type I error and Power. Remember to use the save()
and load()
functions. Use the R code provided below.
library(MASS)
N = 50 #this should be divisible by however many groups you use!
number.groups <- 2
number.timepoints <- 1
set.seed(2012021)
dat <- data.frame(
'USUBJID' = rep(paste0('Subject_', formatC(1:N, width = 4, flag = '0')), length.out= N*number.timepoints),
'Group' = rep(paste0('Group_', 1:number.groups), length.out = N*number.timepoints),
'Time' = rep(paste0('Time_', 1:number.timepoints), each = N),
stringsAsFactors=F)
# Design Matrix
X <- model.matrix( ~ Group , data = dat)
Beta <- matrix(0, nrow = ncol(X), dimnames=list(colnames(X), 'param'))
#Beta[] <- c(0.2, 0) # Type I error
Beta[] <- c(0.2, 1) # Power
# Parameters:
mu <- exp(X %*% Beta)
dat$mu <- as.vector(mu)
theta <- 0.5 # dispersion parameter
dat$theta <- as.vector(theta)
upsilon <- mu + (mu^2)/theta
dat$upsilon <- as.vector(upsilon)
#######
# Simulation
out <- vector()
for(repl in 1:1000){
# Generate Data:
dat$Y_nb <- rnbinom(n = N, size = theta, mu = mu)
#dat$Y_pois <- rpois(n = N, lambda = mu)
# Fit Models -both Poisson and NB to data that is NB
mod.pois <- glm(Y_nb ~ Group, data = dat, family = 'poisson')
mod.nb <- MASS::glm.nb(Y_nb ~ Group, data = dat)
out <- rbind(out, c(
summary(mod.pois)$coef['GroupGroup_2', 'Pr(>|z|)'],
summary(mod.nb)$coef['GroupGroup_2', 'Pr(>|z|)']
))
cat(paste0('Replication: ', repl, '\n'))
}
colMeans(out < 0.05)