Poisson

In this tutorial, we will see how to use Poisson and negative binomial regression. In practice, we use Poisson regression to model a count outcome. Note that we assume the mean is equal to the variance in Poisson. When the variance is greater than what’s assumed by the model, overdispersion occurs. Poisson regression of overdispersed data leads to under-estimated or deflated standard errors, which leads to inflated test statistics and p-values. We can use negative binomial regression to model overdispersed data.

# Load required packages
knitr::opts_chunk$set(echo = TRUE)
require(Publish)
require(survey)

Data

# Load
load("Data/nonbinary/OAcvd.RData")

# Survey weight
analytic2$weight <- analytic2$weight/3

# Make fruit.cont as a numeric variable
analytic2$fruit.cont <- as.numeric(as.character(analytic2$fruit.cont))

# Make fruit.cont as a integer/count variable
analytic2$fruit.cont <- floor(analytic2$fruit.cont) # round

# Factor variables using lapply
var.names <- c("age", "sex", "income", "race", "bmicat", "phyact", "smoke",
               "fruit", "painmed", "ht", "copd", "diab", "edu", "CVD", "OA")
analytic2[var.names] <- lapply(analytic2[var.names] , factor)
str(analytic2)
#> 'data.frame':    21623 obs. of  17 variables:
#>  $ CVD       : Factor w/ 2 levels "0 event","event": 1 1 1 1 1 1 1 1 1 1 ...
#>  $ age       : Factor w/ 4 levels "20-39 years",..: 2 3 4 1 3 1 3 1 3 3 ...
#>  $ sex       : Factor w/ 2 levels "Female","Male": 2 1 1 1 2 1 2 1 1 2 ...
#>  $ income    : Factor w/ 4 levels "$29,999 or less",..: 3 4 1 2 1 2 2 3 3 4 ...
#>  $ race      : Factor w/ 2 levels "Non-white","White": 2 2 2 2 2 2 2 2 2 2 ...
#>  $ bmicat    : Factor w/ 3 levels "Normal","Overweight",..: 1 1 2 2 2 1 2 1 2 2 ...
#>  $ phyact    : Factor w/ 3 levels "Active","Inactive",..: 3 1 1 2 1 3 2 3 2 3 ...
#>  $ smoke     : Factor w/ 3 levels "Current smoker",..: 2 3 3 2 2 2 2 2 3 2 ...
#>  $ fruit     : Factor w/ 3 levels "0-3 daily serving",..: 2 2 3 2 1 3 1 3 2 2 ...
#>  $ painmed   : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 1 1 ...
#>  $ ht        : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
#>  $ copd      : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
#>  $ diab      : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
#>  $ edu       : Factor w/ 4 levels "< 2ndary","2nd grad.",..: 2 2 1 4 4 4 1 4 4 1 ...
#>  $ weight    : num  4.8 13.29 4.43 11.1 9.61 ...
#>  $ OA        : Factor w/ 2 levels "Control","OA": 2 1 2 1 1 1 2 1 1 1 ...
#>  $ fruit.cont: num  3 4 8 3 2 10 1 8 5 3 ...
#>  - attr(*, "na.action")= 'omit' Named int [1:219757] 1 2 3 4 5 6 7 8 9 10 ...
#>   ..- attr(*, "names")= chr [1:219757] "3" "5" "7" "9" ...

Survey weighted summary

# Survey design
w.design <- svydesign(id=~1, weights=~weight, data=analytic2)

# Cross-tabulation
xtabs(~fruit.cont, analytic2)
#> fruit.cont
#>    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
#>  407 1628 3304 4261 3759 2887 1980 1288  809  480  303  174  109   80   51   25 
#>   16   17   18   19   20   21   22   23   24   25   26   27   29   42 
#>   21   14    9    4    5    4    2    6    7    1    2    1    1    1
svytable(~fruit.cont, w.design)
#> fruit.cont
#>            0            1            2            3            4            5 
#>   9085.00222  39974.10444  88216.88222 119206.45778 108665.00556  82149.23111 
#>            6            7            8            9           10           11 
#>  53791.70444  35463.49111  22593.73000  13023.83556   8878.24778   4520.85000 
#>           12           13           14           15           16           17 
#>   3476.42667   2210.76000   1911.70556    973.93444    571.37667    414.44444 
#>           18           19           20           21           22           23 
#>    270.82556     77.76222    253.13000     24.74778     34.17444    185.70778 
#>           24           25           26           27           29           42 
#>    154.67778    109.88444    149.81222     89.61444     14.91111     93.76889
svyhist(~fruit.cont, w.design)

svyby(~fruit.cont, ~phyact, w.design, svymean, deff = TRUE)

Poisson regression

Let us fit the traditional (not design-adjusted) Poisson regression using the glm function:

require(jtools)
analytic2$phyact2=relevel(analytic2$phyact, ref ="Inactive")

# Poisson regression - crude
fit1 <- glm(fruit.cont ~phyact2, data=analytic2, family=poisson)
summ(fit1, confint = TRUE, digits = 3)

Observations	21623
Dependent variable	fruit.cont
Type	Generalized linear model
Family	poisson
Link	log

χ²(2)	1219.347
Pseudo-R² (Cragg-Uhler)	0.055
Pseudo-R² (McFadden)	0.012
AIC	98570.197
BIC	98594.142

	Est.	2.5%	97.5%	z val.	p
(Intercept)	1.337	1.328	1.347	267.387	0.000
phyact2Active	0.274	0.259	0.289	34.983	0.000
phyact2Moderate	0.136	0.120	0.152	16.741	0.000
Standard errors: MLE


# Poisson regression - adjusted for covariates
fit2 <- glm(fruit.cont ~ phyact2 + age + sex + 
               income + race + bmicat + 
               smoke + edu, data=analytic2, family=poisson)
summ(fit2, confint = TRUE, digits = 3, vifs = TRUE)

Observations	21623
Dependent variable	fruit.cont
Type	Generalized linear model
Family	poisson
Link	log

χ²(17)	2984.005
Pseudo-R² (Cragg-Uhler)	0.130
Pseudo-R² (McFadden)	0.030
AIC	96835.540
BIC	96979.207

	Est.	2.5%	97.5%	z val.	p	VIF
(Intercept)	1.156	1.123	1.189	68.773	0.000	NA
phyact2Active	0.250	0.235	0.266	31.432	0.000	1.040
phyact2Moderate	0.112	0.095	0.128	13.640	0.000	1.040
age40-49 years	0.027	0.010	0.043	3.196	0.001	1.102
age50-59 years	0.083	0.066	0.100	9.400	0.000	1.102
age60-64 years	0.127	0.103	0.151	10.287	0.000	1.102
sexMale	-0.173	-0.187	-0.160	-25.108	0.000	1.082
income$30,000-$49,999	0.039	0.017	0.060	3.557	0.000	1.144
income$50,000-$79,999	0.050	0.030	0.070	4.923	0.000	1.144
income$80,000 or more	0.083	0.062	0.103	7.964	0.000	1.144
raceWhite	-0.003	-0.024	0.018	-0.240	0.811	1.077
bmicatOverweight	-0.021	-0.035	-0.008	-3.054	0.002	1.096
bmicatUnderweight	-0.001	-0.034	0.033	-0.030	0.976	1.096
smokeFormer smoker	0.131	0.114	0.148	15.050	0.000	1.132
smokeNever smoker	0.181	0.163	0.199	19.436	0.000	1.132
edu2nd grad.	0.040	0.016	0.064	3.291	0.001	1.125
eduOther 2nd grad.	0.051	0.020	0.083	3.201	0.001	1.125
eduPost-2nd grad.	0.122	0.101	0.144	11.219	0.000	1.125
Standard errors: MLE

Survey weighted Poisson regression

Now, let’s fit the design-adjusted Poisson regression using the svyglm function:

require(jtools)
# Design
w.design <- update (w.design , phyact2=relevel(phyact, ref ="Inactive"))

# Design-adjusted Poisson - crude
fit1 <- svyglm(fruit.cont ~phyact2, design=w.design, family=poisson)
summ(fit1, confint = TRUE, digits = 3)

Observations	21623
Dependent variable	fruit.cont
Type	Survey-weighted generalized linear model
Family	poisson
Link	log

Pseudo-R² (Cragg-Uhler)	0.064
Pseudo-R² (McFadden)	0.035
AIC	99087.436

	Est.	2.5%	97.5%	t val.	p
(Intercept)	1.373	1.354	1.391	146.410	0.000
phyact2Active	0.261	0.231	0.292	16.877	0.000
phyact2Moderate	0.124	0.095	0.154	8.230	0.000
Standard errors: Robust


# Design-adjusted Poisson - adjusted for covariates
fit2<-svyglm(fruit.cont ~phyact2 + age + sex + 
               income + race + bmicat + 
               smoke + edu, design=w.design, family=poisson)
summ(fit2, confint = TRUE, digits = 3, vifs = TRUE)

Observations	21623
Dependent variable	fruit.cont
Type	Survey-weighted generalized linear model
Family	poisson
Link	log

Pseudo-R² (Cragg-Uhler)	0.137
Pseudo-R² (McFadden)	0.076
AIC	97795.011

	Est.	2.5%	97.5%	t val.	p	VIF
(Intercept)	1.175	1.113	1.237	37.230	0.000	NA
phyact2Active	0.241	0.210	0.272	15.211	0.000	1.346
phyact2Moderate	0.105	0.074	0.135	6.775	0.000	1.346
age40-49 years	0.041	0.010	0.071	2.623	0.009	1.447
age50-59 years	0.102	0.071	0.133	6.402	0.000	1.447
age60-64 years	0.149	0.096	0.203	5.461	0.000	1.447
sexMale	-0.136	-0.161	-0.111	-10.500	0.000	1.148
income$30,000-$49,999	0.025	-0.015	0.066	1.217	0.224	1.294
income$50,000-$79,999	0.031	-0.008	0.070	1.559	0.119	1.294
income$80,000 or more	0.057	0.018	0.096	2.874	0.004	1.294
raceWhite	0.023	-0.012	0.059	1.289	0.197	1.193
bmicatOverweight	-0.009	-0.035	0.017	-0.708	0.479	1.331
bmicatUnderweight	0.029	-0.033	0.091	0.928	0.353	1.331
smokeFormer smoker	0.094	0.059	0.128	5.352	0.000	1.474
smokeNever smoker	0.140	0.102	0.177	7.290	0.000	1.474
edu2nd grad.	0.026	-0.020	0.072	1.093	0.274	1.302
eduOther 2nd grad.	0.033	-0.024	0.090	1.136	0.256	1.302
eduPost-2nd grad.	0.129	0.085	0.173	5.807	0.000	1.302
Standard errors: Robust

Negative binomial regression

Let’s fit the negative binomial regression model using the glm function. Below, we specify the dispersion parameter (theta) is equal to 1, which suggests that the ratio of mean and variance is assumed to be 1.

require(MASS)
analytic2$phyact2=relevel(analytic2$phyact, ref ="Inactive")

# Negative binomial regression - crude
fit3<- glm(fruit.cont ~ phyact2, data=analytic2, family = negative.binomial(theta = 1))
round(exp(cbind(coef(fit3), confint(fit3))),2)
#> Waiting for profiling to be done...
#>                      2.5 % 97.5 %
#> (Intercept)     3.81  3.76   3.85
#> phyact2Active   1.32  1.29   1.34
#> phyact2Moderate 1.15  1.12   1.17

# Negative binomial regression - adjusted for covariates
fit4<-glm(fruit.cont ~phyact2 + age + sex + income + race + bmicat + smoke + edu, data=analytic2,
          family = negative.binomial(theta = 1))
round(exp(cbind(coef(fit4), confint(fit4))),2)
#> Waiting for profiling to be done...
#>                            2.5 % 97.5 %
#> (Intercept)           3.17  3.05   3.30
#> phyact2Active         1.29  1.26   1.31
#> phyact2Moderate       1.12  1.10   1.14
#> age40-49 years        1.03  1.01   1.05
#> age50-59 years        1.08  1.06   1.11
#> age60-64 years        1.14  1.10   1.17
#> sexMale               0.84  0.83   0.86
#> income$30,000-$49,999 1.05  1.02   1.07
#> income$50,000-$79,999 1.06  1.03   1.08
#> income$80,000 or more 1.09  1.07   1.12
#> raceWhite             0.99  0.97   1.02
#> bmicatOverweight      0.98  0.96   1.00
#> bmicatUnderweight     0.99  0.95   1.04
#> smokeFormer smoker    1.14  1.12   1.16
#> smokeNever smoker     1.20  1.17   1.23
#> edu2nd grad.          1.04  1.01   1.07
#> eduOther 2nd grad.    1.05  1.01   1.09
#> eduPost-2nd grad.     1.13  1.10   1.16

Survey weighted negative binomial regression

Now, let’s fit the design-adjusted negative binomial regression model:

require(sjstats)

# Design-adjusted negative binomial - crude
fit3<- svyglm.nb(fruit.cont ~phyact2, design=w.design)
round(exp(cbind(coef(fit3), confint(fit3))),2)
#>                                2.5 %    97.5 %
#> theta.(Intercept)   28859.40 5797.98 143647.38
#> eta.(Intercept)         3.95    3.87      4.02
#> eta.phyact2Active       1.30    1.26      1.34
#> eta.phyact2Moderate     1.13    1.10      1.17

# Design-adjusted negative binomial - adjusted for covariates
fit4<-svyglm.nb(fruit.cont ~phyact2 + age + sex + income + race + bmicat + smoke + edu, 
                design=w.design)
round(exp(cbind(coef(fit4), confint(fit4))),2)
#>                                        2.5 %     97.5 %
#> theta.(Intercept)         132340.69 15727.49 1113594.94
#> eta.(Intercept)                3.24     3.04       3.44
#> eta.phyact2Active              1.27     1.23       1.31
#> eta.phyact2Moderate            1.11     1.08       1.14
#> eta.age40-49 years             1.04     1.01       1.07
#> eta.age50-59 years             1.11     1.07       1.14
#> eta.age60-64 years             1.16     1.10       1.22
#> eta.sexMale                    0.87     0.85       0.90
#> eta.income$30,000-$49,999      1.03     0.99       1.07
#> eta.income$50,000-$79,999      1.03     0.99       1.07
#> eta.income$80,000 or more      1.06     1.02       1.10
#> eta.raceWhite                  1.02     0.99       1.06
#> eta.bmicatOverweight           0.99     0.97       1.02
#> eta.bmicatUnderweight          1.03     0.97       1.09
#> eta.smokeFormer smoker         1.10     1.06       1.14
#> eta.smokeNever smoker          1.15     1.11       1.19
#> eta.edu2nd grad.               1.03     0.98       1.07
#> eta.eduOther 2nd grad.         1.03     0.98       1.09
#> eta.eduPost-2nd grad.          1.14     1.09       1.19

Video content (optional)

Tip

For those who prefer a video walkthrough, feel free to watch the video below, which offers a description of an earlier version of the above content.