Poisson and Negative Binomial Distributions • SPR

Reference for Negative Binomial Models

Ben bolker, one of the main creators of the glmmTMB R package, wrote this: https://ms.mcmaster.ca/~bolker/emdbook/book.pdf

I also think you should own Advanced Statistics with Application in R by Demidenko

Links:

Relationship between Negative Binomial and Poisson

Illustrate how the Negative Binomial distribution

Changes shape as mu changes
Changes shape as theta changes
When does NB approach Poisson distribution? Please show multiple plots together to visualize the change

Simulation Study

Conduct a simulation study with a unique set of conditions and compute the Type I error and Power. Remember to use the save() and load() functions. Use the R code provided below.

Generate cross sectional data

library(SPR)
library(MASS)

N = 1e5 #this should be divisible by however many groups you use!
number.groups <- 2
number.timepoints <- 1
set.seed(2012021)

dat <- data.frame(
                  'USUBJID' = rep(paste0('Subject_', formatC(1:N, width = 4, flag = '0')), length.out= N*number.timepoints),
                  'Group' = rep(paste0('Group_', 1:number.groups), length.out = N*number.timepoints),
                  #'Y_nb' = rep(NA, N*number.timepoints), 
                  #'Y_pois' = rep(NA, N*number.timepoints), 
                  #'Bio' = rep(rnorm(N, mean = 0, sd = 1), number.timepoints),
                  'Time' = rep(paste0('Time_', 1:number.timepoints), each = N),
                  stringsAsFactors=F)


# Design Matrix
X <- model.matrix( ~ Group , data = dat) 
Beta <- matrix(0, nrow = ncol(X), dimnames=list(colnames(X), 'param'))
Beta[] <- c(0.2, 1)

# Parameters:
mu <- exp(X %*% Beta)
dat$mu <- as.vector(mu)

theta <- 1 # dispersion parameter
dat$theta <- as.vector(theta)

upsilon <- mu + (mu^2)/theta
dat$upsilon <- as.vector(upsilon)

prob <- theta/(theta + mu)

dat$Y_nb <- rnbinom(n = N, size = theta, mu = mu)
  hist.nb <- hist(dat$Y_nb, plot = F)
  hist.nb$counts <- 100*hist.nb$counts/N
  plot(hist.nb, ylim = c(0, 100), ylab = 'Percentage', col = 'grey', main = 'Negative Binomial')


dat$Y_pois <- rpois(n = N, lambda = mu)
  hist.pois <- hist(dat$Y_pois, plot = F)
  hist.pois$counts <- 100*hist.pois$counts/N
  plot(hist.pois, ylim = c(0, 100), ylab = 'Percentage', col = 'grey', main = 'Poisson')

# Poisson:
# Mean
aggregate(Y_pois ~ Group, FUN = mean, data = dat)
aggregate(mu ~ Group, FUN = mean, data = dat)
# Variance
aggregate(Y_pois ~ Group, FUN = var, data = dat)

# Negative Binomial:
# Mean:
aggregate(Y_nb ~ Group, FUN = mean, data = dat)
aggregate(mu ~ Group, FUN = mean, data = dat)

# Variance:
aggregate(Y_nb ~ Group, FUN = var, data = dat)
aggregate(upsilon ~ Group, FUN = mean, data = dat)


# Fit Models:

mod.pois <- glm(Y_pois ~ Group, data = dat, family = 'poisson')
summary(mod.pois)

mod.nb <- MASS::glm.nb(Y_nb ~ Group, data = dat)
summary(mod.nb)


# Fit Models using glmmTMB R package:

# Poisson
mp <- glmmTMB(Y_pois ~ Group, family=poisson, data=dat)
summary(mp)

# NB
mnp <- glmmTMB(Y_nb ~ Group, family=nbinom2, data=dat)
summary(mnp)

Plots

This R code helps to illustrate the relationship between the two distributions.

library(SPR)
library(MASS)

library(ggplot2)
library(gridExtra)

N = 1e5 #this should be divisible by however many groups you use!
number.groups <- 2
number.timepoints <- 1
set.seed(4202021)

dat <- data.frame(
                  'USUBJID' = rep(paste0('Subject_', formatC(1:N, width = 4, flag = '0')), length.out= N*number.timepoints),
                  'Group' = rep(paste0('Group_', 1:number.groups), length.out = N*number.timepoints),
                  #'Y_nb' = rep(NA, N*number.timepoints), 
                  #'Y_pois' = rep(NA, N*number.timepoints), 
                  #'Bio' = rep(rnorm(N, mean = 0, sd = 1), number.timepoints),
                  'Time' = rep(paste0('Time_', 1:number.timepoints), each = N),
                  stringsAsFactors=F)


# Design Matrix
X <- model.matrix( ~ Group , data = dat) 
Beta <- matrix(0, nrow = ncol(X), dimnames=list(colnames(X), 'param'))
Beta[] <- c(0.2, 1)

# Parameters:
mu <- exp(X %*% Beta)
dat$mu <- as.vector(mu)



for (theta in c(1000, 100, 10, 1, 0.1)) {
  
  #theta <- 1000# dispersion parameter
  dat$theta <- as.vector(theta)

  upsilon <- mu + (mu^2)/theta
  dat$upsilon <- as.vector(upsilon)

  prob <- theta/(theta + mu)

 # Generate Data:
  dat$Y_nb <- rnbinom(n = N, size = theta, mu = mu)
  dat$Y_pois <- rpois(n = N, lambda = mu)

  p1 <- ggplot2::ggplot(dat, aes(x = Y_nb)) + 
            geom_bar(aes(y = 100*(..count..)/sum(..count..))) +
            labs(subtitle = paste0('Theta = ', theta), title='Negative Binomial', x ="Value", y = "Percent") +
            ylim(c(0, 25)) + 
            theme_minimal() + 
            theme(plot.title = element_text(hjust = 0.5), 
                  plot.subtitle = element_text(hjust = 0.5)) 

  
  p2 <- ggplot2::ggplot(dat, aes(x = Y_pois)) + 
            geom_bar(aes(y = 100*(..count..)/sum(..count..))) +
            labs(subtitle = paste0('Theta = ', theta), title='Poisson', x ="Value", y = "Percent") +
            ylim(c(0, 50)) + 
            theme_minimal() + 
            theme(plot.title = element_text(hjust = 0.5), 
                  plot.subtitle = element_text(hjust = 0.5)) 

  
  gridExtra::grid.arrange(p1, p2, ncol = 2)   


  # Poisson:
  # Mean
  aggregate(Y_pois ~ Group, FUN = mean, data = dat)
  aggregate(mu ~ Group, FUN = mean, data = dat)
  # Variance
  aggregate(Y_pois ~ Group, FUN = var, data = dat)
  
  # Negative Binomial:
  # Mean:
  aggregate(Y_nb ~ Group, FUN = mean, data = dat)
  aggregate(mu ~ Group, FUN = mean, data = dat)

  # Variance:
  aggregate(Y_nb ~ Group, FUN = var, data = dat)
  aggregate(upsilon ~ Group, FUN = mean, data = dat)

}

Custom Estimation Routine

Custom estimation code helps to understand how the model is estimated.

# Custom written estimation routine:
Neg_Binom_loglike <- function(vP, dat, Y){
  
  # the 'par' (parameters to estimate) are only passed as a vector
  # re-create what you need from that vector
  Beta <- vP[c('b0', 'b1')]
  Beta <- matrix(Beta, ncol = 1)
  size <- vP['theta']

  X <- model.matrix( ~ Group , data = dat) 
  Y <- dat[ , Y]
  # Parameters:
  mu <- exp(X %*% Beta)
  prob <- size/(size + mu)
  
  loglike <- lgamma(Y + size) - lgamma(size) - lgamma(Y + 1) + size*log(prob) + (Y)*log(1-prob)
  loglike <- -1*loglike
  loglike <- sum(loglike)
  return(loglike)
  
}


# optimize
Beta; theta # Gen param
vP <- c('b0' = 0.2, 'b1' = 1, 'theta' = 1)
out <- optim(par = vP, fn = Neg_Binom_loglike, method = 'BFGS', 
             dat = dat, Y = 'Y_nb')
# Compare Generating Parameter to R package estimate and the custom code estimate

cbind('Gen Param' = c(as.vector(Beta), theta), 'Custom' = out$par, 'R package' = c(coef(mod.nb), mod.nb$theta))


# Custom written estimation routine:
Poisson_loglike <- function(vP, dat, Y){
  
  # the 'par' (parameters to estimate) are only passed as a vector
  # re-create what you need from that vector
  Beta <- vP[c('b0', 'b1')]
  Beta <- matrix(Beta, ncol = 1)

  X <- model.matrix( ~ Group , data = dat) 
  Y <- dat[ , Y]
  # Parameters:
  mu <- exp(X %*% Beta)

  loglike <-  Y * log(mu) - mu - lgamma(Y + 1)
  loglike <- -1*loglike
  loglike <- sum(loglike)
  return(loglike)
  
}


# optimize
vP <- c('b0' = 0, 'b1' = 0.5)
  start <- Sys.time()
out <- optim(par = vP, fn = Poisson_loglike, method = 'BFGS', 
             dat = dat, Y = 'Y_pois')
  end <- Sys.time()

  end - start
# Compare Generating Parameter to R package estimate and the custom code estimate
cbind('**Gen Param**' = c(as.vector(Beta)), '**Custom**' = out$par, '**R package**' = c(coef(mod.pois)))

Simulation Study: Poisson and Negative Binomial Models

Evaluate Type I error and Power

Conduct a simulation study with a unique set of conditions and compute the Type I error and Power. Remember to use the save() and load() functions. Use the R code provided below.

library(MASS)

N = 50 #this should be divisible by however many groups you use!
number.groups <- 2
number.timepoints <- 1
set.seed(2012021)

dat <- data.frame(
                  'USUBJID' = rep(paste0('Subject_', formatC(1:N, width = 4, flag = '0')), length.out= N*number.timepoints),
                  'Group' = rep(paste0('Group_', 1:number.groups), length.out = N*number.timepoints),
                  'Time' = rep(paste0('Time_', 1:number.timepoints), each = N),
                  stringsAsFactors=F)


# Design Matrix
X <- model.matrix( ~ Group , data = dat) 
Beta <- matrix(0, nrow = ncol(X), dimnames=list(colnames(X), 'param'))
#Beta[] <- c(0.2, 0)  # Type I error
Beta[] <- c(0.2, 1)  # Power

# Parameters:
mu <- exp(X %*% Beta)
dat$mu <- as.vector(mu)

theta <- 0.5 # dispersion parameter
dat$theta <- as.vector(theta)

upsilon <- mu + (mu^2)/theta
dat$upsilon <- as.vector(upsilon)


#######
# Simulation
out <- vector()

for(repl in 1:1000){
  
  # Generate Data:
    dat$Y_nb <- rnbinom(n = N, size = theta, mu = mu)
    #dat$Y_pois <- rpois(n = N, lambda = mu)
 

  # Fit Models -both Poisson and NB to data that is NB
    mod.pois <- glm(Y_nb ~ Group, data = dat, family = 'poisson')
    mod.nb <- MASS::glm.nb(Y_nb ~ Group, data = dat)

    out <- rbind(out, c( 
                 summary(mod.pois)$coef['GroupGroup_2', 'Pr(>|z|)'], 
                 summary(mod.nb)$coef['GroupGroup_2', 'Pr(>|z|)']
    ))
  
  cat(paste0('Replication: ', repl, '\n'))
  
  }

colMeans(out < 0.05)