library(tidyverse)
library(haven)
library(survey)
library(knitr)
library(kableExtra)
library(usmap)
library(ggplot2)
library(scales)
source("R/utils.R")
if (!dir.exists("outputs")) dir.create("outputs")
Dataset and documentation available at https://www.cdc.gov/brfss/annual_data/annual_2024.html
# Import the 2024 BRFSS dataset (SAS Transport Format)
brfss_data <- read_xpt("data/LLCP2024.XPT")
# Generate summary table of all variables. Labels are truncated-- the full titles are available in the 2024 BRFSS Codebook CDC.
var_dict <- build_variable_dictionary(brfss_data)
head(var_dict)
# List of columns to keep
keep_vars <- c("_STATE", "_LLCPWT", "_PSU", "_STSTR",
"_AGEG5YR", "_SEX", "_RACEGR3", "_URBSTAT",
"_FLSHOT7", "_PNEUMO3", "_AIDTST4", "_CRVSCRN",
"_BMI5CAT", "_RFSMOK3")
# Create smaller dataframe for more efficient analysis
brfss_small <- brfss_data[, keep_vars]
var_dict_small <- build_variable_dictionary(brfss_small)
# Optional after creating the smaller working dataset
rm(brfss_data) # Remove the large original
gc() # Run garbage collection to free memory
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 2864174 153.0 5457597 291.5 5457597 291.5
Vcells 12361681 94.4 306640558 2339.5 383159014 2923.3
Variable _FLSHOT7 already only includes those 65+. A
response of ‘9’ indicates ‘Don’t know/Not Sure’ Or ‘Refused/Missing’.
Dropped for analysis.
# Clean the flu shot variable and remove missing codes
brfss_small$flu_shot_clean <- ifelse(brfss_small$`_FLSHOT7` %in% c(9, NA), NA, brfss_small$`_FLSHOT7`)
# Calculate percentage of '9's in each state to make sure state-by-state comparison is valid after dropping
pct_9_by_state <- brfss_small %>%
group_by(`_STATE`) %>%
summarize(
n_total = n(),
n_9 = sum(`_FLSHOT7` == 9, na.rm = TRUE),
pct_9 = n_9 / n_total * 100
) %>%
arrange(desc(pct_9)) # sort from highest to lowest
pct_9_by_state
% of respondents answering “9” for _FLSHOT7 varies
between 8.05 and 1.25% state-by-state. As it’s relatively low, it seems
unlikely that it would meaningfully bias interstate comparisons.
# Adjust for "lonely" PSUs (strata with only one PSU) in variance estimation
options(survey.lonely.psu = "adjust")
# Define the survey design for flu vaccination data
flu_design <- svydesign(
id = ~`_PSU`, # Primary Sampling Unit
strata = ~`_STSTR`, # Stratification variable
weights = ~`_LLCPWT`, # Survey weights
data = brfss_small,
nest = TRUE # Indicates nested structure
)
# Estimate the mean flu vaccination rate
flu_est <- svymean(~factor(flu_shot_clean), design = flu_design, na.rm = TRUE)
flu_est
mean SE
factor(flu_shot_clean)1 0.6253 0.003
factor(flu_shot_clean)2 0.3747 0.003
# Format results for readability
flu_yes <- coef(flu_est)[1]
flu_ci <- confint(flu_est)[1, ]
cat(sprintf(
"Flu vaccination rate (age 65+): %.1f%% (95%% CI: %.1f%%–%.1f%%)\n",
flu_yes * 100,
flu_ci[1] * 100,
flu_ci[2] * 100
))
Flu vaccination rate (age 65+): 62.5% (95% CI: 61.9%–63.1%)
flu_by_state <- svyby(
~factor(flu_shot_clean),
~`_STATE`,
design = flu_design,
FUN = svymean,
na.rm = TRUE
)
# Map state codes to names
state_labels <- c(
`1` = "Alabama", `2` = "Alaska", `4` = "Arizona", `5` = "Arkansas",
`6` = "California", `8` = "Colorado", `9` = "Connecticut", `10` = "Delaware",
`11` = "District of Columbia", `12` = "Florida", `13` = "Georgia", `15` = "Hawaii",
`16` = "Idaho", `17` = "Illinois", `18` = "Indiana", `19` = "Iowa",
`20` = "Kansas", `21` = "Kentucky", `22` = "Louisiana", `23` = "Maine",
`24` = "Maryland", `25` = "Massachusetts", `26` = "Michigan", `27` = "Minnesota",
`28` = "Mississippi", `29` = "Missouri", `30` = "Montana", `31` = "Nebraska",
`32` = "Nevada", `33` = "New Hampshire", `34` = "New Jersey", `35` = "New Mexico",
`36` = "New York", `37` = "North Carolina", `38` = "North Dakota", `39` = "Ohio",
`40` = "Oklahoma", `41` = "Oregon", `42` = "Pennsylvania", `44` = "Rhode Island",
`45` = "South Carolina", `46` = "South Dakota", `48` = "Texas", `49` = "Utah",
`50` = "Vermont", `51` = "Virginia", `53` = "Washington", `54` = "West Virginia",
`55` = "Wisconsin", `56` = "Wyoming", `66` = "Guam", `72` = "Puerto Rico",
`78` = "Virgin Islands"
)
# Convert state variable in flu_by_state to the labels
flu_by_state$State <- state_labels[as.character(flu_by_state$`_STATE`)]
flu_by_state
# 95% CI using normal approximation
flu_by_state <- flu_by_state %>%
mutate(
flu_rate = `factor(flu_shot_clean)1`, # your column with vaccination rate (1 = yes)
flu_se = `se.factor(flu_shot_clean)1`, # your column with SE
lower_ci = flu_rate - 1.96 * flu_se,
upper_ci = flu_rate + 1.96 * flu_se
)
# Create table of state flu vaccination data
flu_by_state %>%
select(State, flu_rate, lower_ci, upper_ci) %>%
arrange(desc(flu_rate)) %>%
kable(digits = 3,
col.names = c("State", "Vaccination Rate", "Lower CI", "Upper CI"),
caption = "Flu Vaccination Rates (Age 65+) by State with 95% CIs") %>%
kable_styling(full_width = FALSE)
| State | Vaccination Rate | Lower CI | Upper CI | |
|---|---|---|---|---|
| 50 | Vermont | 0.731 | 0.708 | 0.754 |
| 25 | Massachusetts | 0.725 | 0.703 | 0.746 |
| 33 | New Hampshire | 0.715 | 0.696 | 0.734 |
| 24 | Maryland | 0.708 | 0.689 | 0.727 |
| 44 | Rhode Island | 0.706 | 0.677 | 0.735 |
| 11 | District of Columbia | 0.705 | 0.668 | 0.742 |
| 23 | Maine | 0.699 | 0.681 | 0.716 |
| 9 | Connecticut | 0.695 | 0.664 | 0.726 |
| 10 | Delaware | 0.690 | 0.660 | 0.721 |
| 51 | Virginia | 0.684 | 0.657 | 0.711 |
| 27 | Minnesota | 0.677 | 0.660 | 0.693 |
| 53 | Washington | 0.672 | 0.659 | 0.685 |
| 8 | Colorado | 0.671 | 0.653 | 0.690 |
| 37 | North Carolina | 0.669 | 0.635 | 0.703 |
| 41 | Oregon | 0.665 | 0.638 | 0.692 |
| 6 | California | 0.655 | 0.627 | 0.683 |
| 21 | Kentucky | 0.649 | 0.623 | 0.676 |
| 31 | Nebraska | 0.649 | 0.630 | 0.668 |
| 19 | Iowa | 0.649 | 0.627 | 0.670 |
| 35 | New Mexico | 0.645 | 0.609 | 0.681 |
| 29 | Missouri | 0.642 | 0.615 | 0.670 |
| 36 | New York | 0.636 | 0.620 | 0.653 |
| 46 | South Dakota | 0.636 | 0.577 | 0.695 |
| 55 | Wisconsin | 0.636 | 0.619 | 0.653 |
| 34 | New Jersey | 0.634 | 0.604 | 0.663 |
| 20 | Kansas | 0.633 | 0.613 | 0.653 |
| 42 | Pennsylvania | 0.631 | 0.588 | 0.675 |
| 26 | Michigan | 0.629 | 0.609 | 0.649 |
| 15 | Hawaii | 0.627 | 0.600 | 0.655 |
| 18 | Indiana | 0.625 | 0.606 | 0.643 |
| 39 | Ohio | 0.623 | 0.599 | 0.646 |
| 5 | Arkansas | 0.620 | 0.594 | 0.646 |
| 17 | Illinois | 0.619 | 0.597 | 0.641 |
| 38 | North Dakota | 0.612 | 0.588 | 0.636 |
| 54 | West Virginia | 0.611 | 0.588 | 0.634 |
| 40 | Oklahoma | 0.603 | 0.580 | 0.626 |
| 49 | Utah | 0.600 | 0.579 | 0.621 |
| 28 | Mississippi | 0.598 | 0.552 | 0.644 |
| 45 | South Carolina | 0.597 | 0.573 | 0.621 |
| 4 | Arizona | 0.596 | 0.563 | 0.628 |
| 1 | Alabama | 0.595 | 0.567 | 0.624 |
| 13 | Georgia | 0.589 | 0.557 | 0.620 |
| 30 | Montana | 0.583 | 0.561 | 0.605 |
| 48 | Texas | 0.578 | 0.541 | 0.615 |
| 16 | Idaho | 0.557 | 0.524 | 0.591 |
| 12 | Florida | 0.557 | 0.529 | 0.585 |
| 56 | Wyoming | 0.545 | 0.520 | 0.570 |
| 66 | Guam | 0.543 | 0.300 | 0.787 |
| 22 | Louisiana | 0.537 | 0.507 | 0.567 |
| 32 | Nevada | 0.525 | 0.469 | 0.580 |
| 2 | Alaska | 0.520 | 0.485 | 0.555 |
| 72 | Puerto Rico | 0.446 | 0.389 | 0.503 |
| 78 | Virgin Islands | 0.308 | 0.187 | 0.429 |
flu_by_state <- flu_by_state %>%
mutate(state = state.abb[match(State, state.name)])
flu_by_state <- flu_by_state %>%
mutate(flu_rate = `factor(flu_shot_clean)1`)
plot_usmap(
data = flu_by_state,
values = "flu_rate",
regions = "states"
) +
scale_fill_viridis_c(
labels = percent_format(accuracy = 1),
name = "Flu Vaccination Rate"
) +
theme(legend.position = "right") +
labs(title = "Flu Vaccination Rate (Age 65+) by State")
ggsave("outputs/flu_map.png", width = 10, height = 6, dpi = 300)
Note: Tennessee is absent from the 2024 BRFSS public dataset. CDC BRFSS 2024 documentation.
# Estimate mean flu vaccination by urban/rural
flu_urban_est <- svyby(
~factor(flu_shot_clean),
~`_URBSTAT`,
design = flu_design,
FUN = svymean,
na.rm = TRUE
)
# Format results
flu_urban_est <- flu_urban_est %>%
mutate(
flu_rate = `factor(flu_shot_clean)1`, # vaccination = 1 (yes)
flu_se = `se.factor(flu_shot_clean)1`,
lower_ci = flu_rate - 1.96 * flu_se,
upper_ci = flu_rate + 1.96 * flu_se,
urban_rural = ifelse(`_URBSTAT` == 1, "Urban", "Rural")
)
# Display
flu_urban_est[, c("urban_rural", "flu_rate", "lower_ci", "upper_ci")]
# Weighted mean by race (65+), ignoring 9 (Don't know/Refused)
flu_race_est <- svyby(
~factor(flu_shot_clean),
~factor(ifelse(`_RACEGR3` %in% c(1,2,3,4,5), `_RACEGR3`, NA)),
design = flu_design,
FUN = svymean,
na.rm = TRUE
)
# Official 5-level race/ethnicity labels
race_labels <- c(
"White only, Non-Hispanic",
"Black only, Non-Hispanic",
"Other race only, Non-Hispanic",
"Multiracial, Non-Hispanic",
"Hispanic"
)
# Explicit column names for "Yes" and its SE
rate_col <- "factor(flu_shot_clean)1"
se_col <- "se.factor(flu_shot_clean)1"
# Print percentages with 95% CIs
for (i in seq_along(race_labels)) {
rate <- flu_race_est[[rate_col]][i]
se <- flu_race_est[[se_col]][i]
lower <- rate - 1.96 * se
upper <- rate + 1.96 * se
cat(sprintf(
"Flu vaccination rate (%s, 65+): %.1f%% (95%% CI: %.1f%%–%.1f%%)\n",
race_labels[i],
rate * 100,
lower * 100,
upper * 100
))
}
Flu vaccination rate (White only, Non-Hispanic, 65+): 65.0% (95% CI: 64.4%–65.6%)
Flu vaccination rate (Black only, Non-Hispanic, 65+): 57.0% (95% CI: 54.7%–59.2%)
Flu vaccination rate (Other race only, Non-Hispanic, 65+): 59.3% (95% CI: 55.4%–63.1%)
Flu vaccination rate (Multiracial, Non-Hispanic, 65+): 56.0% (95% CI: 51.1%–61.0%)
Flu vaccination rate (Hispanic, 65+): 54.8% (95% CI: 51.9%–57.7%)
# Inspect column names once
names(flu_race_est)
[1] "factor(ifelse(`_RACEGR3` %in% c(1, 2, 3, 4, 5), `_RACEGR3`, NA))" "factor(flu_shot_clean)1" "factor(flu_shot_clean)2"
[4] "se.factor(flu_shot_clean)1" "se.factor(flu_shot_clean)2"
# Adjust for "lonely" PSUs (strata with only one PSU) in variance estimation
options(survey.lonely.psu = "adjust")
# Estimate mean flu vaccination by male (1) or female (2)
flu_sex_est <- svyby(
~factor(flu_shot_clean),
~`_SEX`,
design = flu_design,
FUN = svymean,
na.rm = TRUE
)
# Format results
flu_sex_est <- flu_sex_est %>%
mutate(
flu_rate = `factor(flu_shot_clean)1`, # vaccination = 1 (yes)
flu_se = `se.factor(flu_shot_clean)1`,
lower_ci = flu_rate - 1.96 * flu_se,
upper_ci = flu_rate + 1.96 * flu_se,
male_female = case_when(
`_SEX` == 1 ~ "Male",
`_SEX` == 2 ~ "Female",
TRUE ~ NA_character_
)
)
# Display
flu_sex_est[, c("male_female", "flu_rate", "lower_ci", "upper_ci")]