Poisson

In this tutorial, we will see how to use Poisson and negative binomial regression. In practice, we use Poisson regression to model a count outcome. Note that we assume the mean is equal to the variance in Poisson. When the variance is greater than what’s assumed by the model, overdispersion occurs. Poisson regression of overdispersed data leads to under-estimated or deflated standard errors, which leads to inflated test statistics and overly small (anti-conservative) p-values, increasing false positives. We can use negative binomial regression to model overdispersed data.

# Load required packages
knitr::opts_chunk$set(echo = TRUE)
require(Publish)
require(survey)

Data

# Load
load("Data/nonbinary/OAcvd.RData")

# Survey weight
analytic2$weight <- analytic2$weight/3

# Make fruit.cont as a numeric variable
analytic2$fruit.cont <- as.numeric(as.character(analytic2$fruit.cont))

# Make fruit.cont as a integer/count variable
analytic2$fruit.cont <- floor(analytic2$fruit.cont) # round

# Factor variables using lapply
var.names <- c("age", "sex", "income", "race", "bmicat", "phyact", "smoke",
               "fruit", "painmed", "ht", "copd", "diab", "edu", "CVD", "OA")
analytic2[var.names] <- lapply(analytic2[var.names] , factor)
str(analytic2)
#> 'data.frame':    21623 obs. of  17 variables:
#>  $ CVD       : Factor w/ 2 levels "0 event","event": 1 1 1 1 1 1 1 1 1 1 ...
#>  $ age       : Factor w/ 4 levels "20-39 years",..: 2 3 4 1 3 1 3 1 3 3 ...
#>  $ sex       : Factor w/ 2 levels "Female","Male": 2 1 1 1 2 1 2 1 1 2 ...
#>  $ income    : Factor w/ 4 levels "$29,999 or less",..: 3 4 1 2 1 2 2 3 3 4 ...
#>  $ race      : Factor w/ 2 levels "Non-white","White": 2 2 2 2 2 2 2 2 2 2 ...
#>  $ bmicat    : Factor w/ 3 levels "Normal","Overweight",..: 1 1 2 2 2 1 2 1 2 2 ...
#>  $ phyact    : Factor w/ 3 levels "Active","Inactive",..: 3 1 1 2 1 3 2 3 2 3 ...
#>  $ smoke     : Factor w/ 3 levels "Current smoker",..: 2 3 3 2 2 2 2 2 3 2 ...
#>  $ fruit     : Factor w/ 3 levels "0-3 daily serving",..: 2 2 3 2 1 3 1 3 2 2 ...
#>  $ painmed   : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 1 1 ...
#>  $ ht        : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
#>  $ copd      : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
#>  $ diab      : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
#>  $ edu       : Factor w/ 4 levels "< 2ndary","2nd grad.",..: 2 2 1 4 4 4 1 4 4 1 ...
#>  $ weight    : num  4.8 13.29 4.43 11.1 9.61 ...
#>  $ OA        : Factor w/ 2 levels "Control","OA": 2 1 2 1 1 1 2 1 1 1 ...
#>  $ fruit.cont: num  3 4 8 3 2 10 1 8 5 3 ...
#>  - attr(*, "na.action")= 'omit' Named int [1:219757] 1 2 3 4 5 6 7 8 9 10 ...
#>   ..- attr(*, "names")= chr [1:219757] "3" "5" "7" "9" ...

Survey weighted summary

# Survey design
w.design <- svydesign(id=~1, weights=~weight, data=analytic2)

# Cross-tabulation
xtabs(~fruit.cont, analytic2)
#> fruit.cont
#>    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
#>  407 1628 3304 4261 3759 2887 1980 1288  809  480  303  174  109   80   51   25 
#>   16   17   18   19   20   21   22   23   24   25   26   27   29   42 
#>   21   14    9    4    5    4    2    6    7    1    2    1    1    1
svytable(~fruit.cont, w.design)
#> fruit.cont
#>            0            1            2            3            4            5 
#>   9085.00222  39974.10444  88216.88222 119206.45778 108665.00556  82149.23111 
#>            6            7            8            9           10           11 
#>  53791.70444  35463.49111  22593.73000  13023.83556   8878.24778   4520.85000 
#>           12           13           14           15           16           17 
#>   3476.42667   2210.76000   1911.70556    973.93444    571.37667    414.44444 
#>           18           19           20           21           22           23 
#>    270.82556     77.76222    253.13000     24.74778     34.17444    185.70778 
#>           24           25           26           27           29           42 
#>    154.67778    109.88444    149.81222     89.61444     14.91111     93.76889
svyhist(~fruit.cont, w.design)

svyby(~fruit.cont, ~phyact, w.design, svymean, deff = TRUE)

Poisson regression

Let us fit the traditional (not design-adjusted) Poisson regression using the glm function:

require(jtools)
#> Loading required package: jtools
analytic2$phyact2=relevel(analytic2$phyact, ref ="Inactive")

# Poisson regression - crude
fit1 <- glm(fruit.cont ~phyact2, data=analytic2, family=poisson)
summ(fit1, confint = TRUE, digits = 3)
Observations 21623
Dependent variable fruit.cont
Type Generalized linear model
Family poisson
Link log
χ²(2) 1219.347
p 0.000
Pseudo-R² (Cragg-Uhler) 0.055
Pseudo-R² (McFadden) 0.012
AIC 98570.197
BIC 98594.142
Est. 2.5% 97.5% z val. p
(Intercept) 1.337 1.328 1.347 267.387 0.000
phyact2Active 0.274 0.259 0.289 34.983 0.000
phyact2Moderate 0.136 0.120 0.152 16.741 0.000
Standard errors: MLE

# Poisson regression - adjusted for covariates
fit2 <- glm(fruit.cont ~ phyact2 + age + sex + 
               income + race + bmicat + 
               smoke + edu, data=analytic2, family=poisson)
summ(fit2, confint = TRUE, digits = 3, vifs = TRUE)
Observations 21623
Dependent variable fruit.cont
Type Generalized linear model
Family poisson
Link log
χ²(17) 2984.005
p 0.000
Pseudo-R² (Cragg-Uhler) 0.130
Pseudo-R² (McFadden) 0.030
AIC 96835.540
BIC 96979.207
Est. 2.5% 97.5% z val. p VIF
(Intercept) 1.156 1.123 1.189 68.773 0.000 NA
phyact2Active 0.250 0.235 0.266 31.432 0.000 1.040
phyact2Moderate 0.112 0.095 0.128 13.640 0.000 1.040
age40-49 years 0.027 0.010 0.043 3.196 0.001 1.102
age50-59 years 0.083 0.066 0.100 9.400 0.000 1.102
age60-64 years 0.127 0.103 0.151 10.287 0.000 1.102
sexMale -0.173 -0.187 -0.160 -25.108 0.000 1.082
income$30,000-$49,999 0.039 0.017 0.060 3.557 0.000 1.144
income$50,000-$79,999 0.050 0.030 0.070 4.923 0.000 1.144
income$80,000 or more 0.083 0.062 0.103 7.964 0.000 1.144
raceWhite -0.003 -0.024 0.018 -0.240 0.811 1.077
bmicatOverweight -0.021 -0.035 -0.008 -3.054 0.002 1.096
bmicatUnderweight -0.001 -0.034 0.033 -0.030 0.976 1.096
smokeFormer smoker 0.131 0.114 0.148 15.050 0.000 1.132
smokeNever smoker 0.181 0.163 0.199 19.436 0.000 1.132
edu2nd grad. 0.040 0.016 0.064 3.291 0.001 1.125
eduOther 2nd grad. 0.051 0.020 0.083 3.201 0.001 1.125
eduPost-2nd grad. 0.122 0.101 0.144 11.219 0.000 1.125
Standard errors: MLE

Survey weighted Poisson regression

Now, let’s fit the design-adjusted Poisson regression using the svyglm function:

require(jtools)
# Design
w.design <- update (w.design , phyact2=relevel(phyact, ref ="Inactive"))

# Design-adjusted Poisson - crude
fit1 <- svyglm(fruit.cont ~phyact2, design=w.design, family=poisson)
summ(fit1, confint = TRUE, digits = 3)
Observations 21623
Dependent variable fruit.cont
Type Survey-weighted generalized linear model
Family poisson
Link log
Pseudo-R² (Cragg-Uhler) 0.064
Pseudo-R² (McFadden) 0.035
AIC 99087.436
Est. 2.5% 97.5% t val. p
(Intercept) 1.373 1.354 1.391 146.410 0.000
phyact2Active 0.261 0.231 0.292 16.877 0.000
phyact2Moderate 0.124 0.095 0.154 8.230 0.000
Standard errors: Robust

# Design-adjusted Poisson - adjusted for covariates
fit2<-svyglm(fruit.cont ~phyact2 + age + sex + 
               income + race + bmicat + 
               smoke + edu, design=w.design, family=poisson)
summ(fit2, confint = TRUE, digits = 3, vifs = TRUE)
Observations 21623
Dependent variable fruit.cont
Type Survey-weighted generalized linear model
Family poisson
Link log
Pseudo-R² (Cragg-Uhler) 0.137
Pseudo-R² (McFadden) 0.076
AIC 97795.011
Est. 2.5% 97.5% t val. p VIF
(Intercept) 1.175 1.113 1.237 37.230 0.000 NA
phyact2Active 0.241 0.210 0.272 15.211 0.000 1.346
phyact2Moderate 0.105 0.074 0.135 6.775 0.000 1.346
age40-49 years 0.041 0.010 0.071 2.623 0.009 1.447
age50-59 years 0.102 0.071 0.133 6.402 0.000 1.447
age60-64 years 0.149 0.096 0.203 5.461 0.000 1.447
sexMale -0.136 -0.161 -0.111 -10.500 0.000 1.148
income$30,000-$49,999 0.025 -0.015 0.066 1.217 0.224 1.294
income$50,000-$79,999 0.031 -0.008 0.070 1.559 0.119 1.294
income$80,000 or more 0.057 0.018 0.096 2.874 0.004 1.294
raceWhite 0.023 -0.012 0.059 1.289 0.197 1.193
bmicatOverweight -0.009 -0.035 0.017 -0.708 0.479 1.331
bmicatUnderweight 0.029 -0.033 0.091 0.928 0.353 1.331
smokeFormer smoker 0.094 0.059 0.128 5.352 0.000 1.474
smokeNever smoker 0.140 0.102 0.177 7.290 0.000 1.474
edu2nd grad. 0.026 -0.020 0.072 1.093 0.274 1.302
eduOther 2nd grad. 0.033 -0.024 0.090 1.136 0.256 1.302
eduPost-2nd grad. 0.129 0.085 0.173 5.807 0.000 1.302
Standard errors: Robust

Negative binomial regression

Let’s fit the negative binomial regression model using the MASS::glm.nb function, which estimates the dispersion parameter (theta, \(\theta\)) from the data rather than fixing it. In the negative binomial model the variance is \(\text{Var}(Y) = \mu + \mu^2/\theta\), so the variance exceeds the mean whenever \(\theta\) is finite, allowing the model to accommodate overdispersion. As \(\theta \to \infty\) the variance approaches \(\mu\) and the model reduces to Poisson. We report the estimated \(\theta\) alongside the rate ratios.

require(MASS)
analytic2$phyact2=relevel(analytic2$phyact, ref ="Inactive")

# Negative binomial regression - crude
fit3<- glm.nb(fruit.cont ~ phyact2, data=analytic2)
fit3$theta # estimated dispersion parameter (theta)
#> [1] 10.27349
round(exp(cbind(coef(fit3), confint(fit3))),2)
#> Waiting for profiling to be done...
#>                      2.5 % 97.5 %
#> (Intercept)     3.81  3.77   3.85
#> phyact2Active   1.32  1.29   1.34
#> phyact2Moderate 1.15  1.12   1.17

# Negative binomial regression - adjusted for covariates
fit4<-glm.nb(fruit.cont ~phyact2 + age + sex + income + race + bmicat + smoke + edu, data=analytic2)
fit4$theta # estimated dispersion parameter (theta)
#> [1] 12.53347
round(exp(cbind(coef(fit4), confint(fit4))),2)
#> Waiting for profiling to be done...
#>                            2.5 % 97.5 %
#> (Intercept)           3.18  3.06   3.30
#> phyact2Active         1.29  1.26   1.31
#> phyact2Moderate       1.12  1.10   1.14
#> age40-49 years        1.03  1.01   1.05
#> age50-59 years        1.09  1.06   1.11
#> age60-64 years        1.14  1.10   1.17
#> sexMale               0.84  0.83   0.85
#> income$30,000-$49,999 1.04  1.02   1.07
#> income$50,000-$79,999 1.05  1.03   1.08
#> income$80,000 or more 1.09  1.06   1.11
#> raceWhite             1.00  0.97   1.02
#> bmicatOverweight      0.98  0.96   0.99
#> bmicatUnderweight     1.00  0.96   1.04
#> smokeFormer smoker    1.14  1.12   1.16
#> smokeNever smoker     1.20  1.17   1.22
#> edu2nd grad.          1.04  1.01   1.07
#> eduOther 2nd grad.    1.05  1.01   1.09
#> eduPost-2nd grad.     1.13  1.10   1.16

The exponentiated coefficients are rate ratios (IRRs). For example, the phyact2 rate ratio compares the expected fruit consumption count for the corresponding physical-activity level relative to the inactive reference group, holding other covariates fixed.

Survey weighted negative binomial regression

Now, let’s fit the design-adjusted negative binomial regression model. The svyglm.nb fit returns the estimated dispersion parameter (theta) together with the mean-model coefficients (prefixed with eta.). We report theta separately and exponentiate only the mean-model (eta.) coefficients to obtain rate ratios; the theta.(Intercept) row is not a rate ratio and should not be exponentiated.

require(sjstats)

# Helper: exponentiate only the mean-model (eta.) coefficients as rate ratios
nb_irr <- function(fit){
  est <- cbind(coef(fit), confint(fit))
  eta.rows <- grepl("^eta\\.", rownames(est))
  round(exp(est[eta.rows, , drop = FALSE]), 2)
}

# Design-adjusted negative binomial - crude
fit3<- svyglm.nb(fruit.cont ~phyact2, design=w.design)
attr(fit3, "nb.theta") # estimated dispersion parameter (theta)
#> [1] 10.27019
nb_irr(fit3)
#>                          2.5 % 97.5 %
#> eta.(Intercept)     3.95  3.87   4.02
#> eta.phyact2Active   1.30  1.26   1.34
#> eta.phyact2Moderate 1.13  1.10   1.17

# Design-adjusted negative binomial - adjusted for covariates
fit4<-svyglm.nb(fruit.cont ~phyact2 + age + sex + income + race + bmicat + smoke + edu,
                design=w.design)
attr(fit4, "nb.theta") # estimated dispersion parameter (theta)
#> [1] 11.79313
nb_irr(fit4)
#>                                2.5 % 97.5 %
#> eta.(Intercept)           3.24  3.04   3.44
#> eta.phyact2Active         1.27  1.23   1.31
#> eta.phyact2Moderate       1.11  1.08   1.14
#> eta.age40-49 years        1.04  1.01   1.07
#> eta.age50-59 years        1.11  1.07   1.14
#> eta.age60-64 years        1.16  1.10   1.22
#> eta.sexMale               0.87  0.85   0.90
#> eta.income$30,000-$49,999 1.03  0.99   1.07
#> eta.income$50,000-$79,999 1.03  0.99   1.07
#> eta.income$80,000 or more 1.06  1.02   1.10
#> eta.raceWhite             1.02  0.99   1.06
#> eta.bmicatOverweight      0.99  0.97   1.02
#> eta.bmicatUnderweight     1.03  0.97   1.09
#> eta.smokeFormer smoker    1.10  1.06   1.14
#> eta.smokeNever smoker     1.15  1.11   1.19
#> eta.edu2nd grad.          1.03  0.98   1.07
#> eta.eduOther 2nd grad.    1.03  0.98   1.09
#> eta.eduPost-2nd grad.     1.14  1.09   1.19

As above, these exponentiated eta. coefficients are design-adjusted rate ratios (IRRs); a rate ratio above 1 indicates higher expected counts relative to the inactive reference group, and a value below 1 indicates lower expected counts.

Video content (optional)

Tip

For those who prefer a video walkthrough, feel free to watch the video below, which offers a description of an earlier version of the above content.