Load packages and import cleaned dataframes.

library(tidyverse)
library(purrr)
library(dplyr)
library(plotly)
library(tools)

prevalence_df = read_csv("data/PLACES__Local_Data_for_Better_Health__County_Data_2023_release.csv") |>
  janitor::clean_names() |>
  select(year, state_abbr, category, measure, data_value, total_population, category_id, measure_id, short_question_text) |>
  filter(year == 2021)

prevalence_outcome = prevalence_df |>
  filter(category == "Health Outcomes")
  
prevalence_chol = prevalence_outcome |>
  filter(measure_id == "HIGHCHOL") |>
  select(state_abbr, data_value, total_population) |>
  mutate(dis_popu = as.integer(total_population * data_value * 0.01)) |>
  group_by(state_abbr) |>
  summarise(state_abbr, total_popu = sum(total_population), disease_popu = sum(dis_popu)) |>
  distinct() |>
  mutate(chol_pre_2021 = disease_popu / total_popu) |>
  select(-disease_popu)
  
  
prevalence_risk = prevalence_df |>
  filter(category == "Health Risk Behaviors")
  

risk_lpa = prevalence_risk |>
  filter(measure_id == "LPA") |>
  select(state_abbr, data_value, total_population) |>
  mutate(dis_popu = as.integer(total_population * data_value * 0.01)) |>
  group_by(state_abbr) |>
  summarise(state_abbr, total_popu = sum(total_population), disease_popu = sum(dis_popu)) |>
  distinct() |>
  mutate(lpa_pre = disease_popu / total_popu) |>
  select(-disease_popu)


risk_binge = prevalence_risk |>
  filter(measure_id == "BINGE") |>
  select(state_abbr, data_value, total_population) |>
  mutate(dis_popu = as.integer(total_population * data_value * 0.01)) |>
  group_by(state_abbr) |>
  summarise(state_abbr, total_popu = sum(total_population), disease_popu = sum(dis_popu)) |>
  distinct() |>
  mutate(binge_pre = disease_popu / total_popu) |>
  select(-disease_popu)


risk_smoking = prevalence_risk |>
  filter(measure_id == "CSMOKING") |>
  select(state_abbr, data_value, total_population) |>
  mutate(dis_popu = as.integer(total_population * data_value * 0.01)) |>
  group_by(state_abbr) |>
  summarise(state_abbr, total_popu = sum(total_population), disease_popu = sum(dis_popu)) |>
  distinct() |>
  mutate(smoking_pre = disease_popu / total_popu) |>
  select(-disease_popu)


prevalence_prevention = prevalence_df |>
  filter(category == "Prevention")

prevention_insurance = prevalence_prevention |>
  filter(measure_id == "ACCESS2") |>
  select(state_abbr, data_value, total_population) |>
  mutate(dis_popu = as.integer(total_population * data_value * 0.01)) |>
  group_by(state_abbr) |>
  summarise(state_abbr, total_popu = sum(total_population), disease_popu = sum(dis_popu)) |>
  distinct() |>
  mutate(insurance_pre = disease_popu / total_popu) |>
  select(-disease_popu)

prevention_cholscreen = prevalence_prevention |>
  filter(measure_id == "CHOLSCREEN") |>
  select(state_abbr, data_value, total_population) |>
  mutate(dis_popu = as.integer(total_population * data_value * 0.01)) |>
  group_by(state_abbr) |>
  summarise(state_abbr, total_popu = sum(total_population), disease_popu = sum(dis_popu)) |>
  distinct() |>
  mutate(cholscreen_pre = disease_popu / total_popu) |>
  select(-disease_popu)


prevalence_df_2020 = read_csv("data/PLACES__Local_Data_for_Better_Health__County_Data_2023_release.csv") |>
  janitor::clean_names() |>
  select(year, state_abbr, category, measure, data_value, total_population, category_id, measure_id, short_question_text) |>
  filter(year == 2020)

risk_sleep_2020 = prevalence_df_2020 |>
  filter(category == "Health Risk Behaviors") |>
  filter(measure_id == "SLEEP") |>
  select(state_abbr, data_value, total_population) |>
  mutate(dis_popu = as.integer(total_population * data_value * 0.01)) |>
  group_by(state_abbr) |>
  summarise(state_abbr, total_popu = sum(total_population), disease_popu = sum(dis_popu)) |>
  distinct() |>
  mutate(sleep_pre = disease_popu / total_popu) |>
  select(-disease_popu)


prevalence_chol_2020 = prevalence_df |>
  filter(category == "Health Outcomes") |>
  filter(measure_id == "HIGHCHOL") |>
  select(state_abbr, data_value, total_population) |>
  mutate(dis_popu = as.integer(total_population * data_value * 0.01)) |>
  group_by(state_abbr) |>
  summarise(state_abbr, total_popu = sum(total_population), disease_popu = sum(dis_popu)) |>
  distinct() |>
  mutate(chol_pre_2020 = disease_popu / total_popu) |>
  select(-disease_popu)

merged_2021 = Reduce(function(df1, df2) left_join(df1, df2, by = c("state_abbr", "total_popu")), list(prevalence_chol, risk_binge, risk_lpa, risk_smoking, prevention_insurance, prevention_cholscreen))

merged_2020 = left_join(prevalence_chol_2020, risk_sleep_2020)

merged_total = left_join(merged_2021, merged_2020)

Firstly, we start with getting a general grasp of the prevalence situation of high cholesterol and 6 factors of health behaviors that have high potential to affect high cholesterol. A U.S. prevalence map to show the geographically distribution of HC in the nation, a health behaviors prevalence boxplot to show the prevalence values’ distribution of prevalence of HC and each factors. Then in the purpose of finding most-related factors to put into the modeling process next, we calculate correlation of HC between each factor and show with a table and bar plot.

Prevalence Visulization

The U.S. High Cholesterol Map

This is to have an overview of the prevalence of high cholesterol among the nation.

library(scales)

library(sf)

states = read_sf("data/cb_2022_us_state_500k/") |>
  rename("state_abbr" = "STUSPS")

chol_small_df = merged_total |>
  select(state_abbr, chol_pre_2021)

state_to_remove = c("GU", "MP", "VI", "PR", "AS", "AK", "HI")

geo_df = left_join(states, chol_small_df) |>
  rename("STUSPS" = "state_abbr") |>
  filter(! STUSPS %in% state_to_remove) 

map_plot = ggplot(geo_df) +
  geom_sf(aes(fill = chol_pre_2021, text = paste("State: ", NAME, "<br>Prevalence: ", percent(round(chol_pre_2021, digits = 3))))) +
  scale_fill_distiller("Prevalence", palette="YlOrRd", direction = 1) +
  theme_minimal() +
  theme(axis.ticks = element_blank(), axis.text = element_blank()) +
  labs(title = "Prevalence of High Cholesterol by State")

state_map = ggplotly(map_plot, tooltip = "text") 

 
state_map
  • Based on the map, we can find that the prevalences of high cholesterol varied among the nation, but were generally higher in the south than in the north of the USA.

Health Behaviors Prevalence Boxplot

Draw a boxplot to visualize the prevalence of each health behavior that is related to and has potential association with high cholesterol.

chol_pivot = merged_total |>
  select(state_abbr, chol_pre_2020, chol_pre_2021) |>
  pivot_longer(chol_pre_2020 : chol_pre_2021,
               names_to = "chol_year",
               values_to = "prevalence") |>
  mutate(chol_year = case_match(chol_year,
                                "chol_pre_2020" ~ "High Cholesterol 2020",
                                "chol_pre_2021" ~ "High Cholesterol 2021"
                                  ))


behavior_pivot  = merged_total |>
  select(-total_popu, -chol_pre_2021, -chol_pre_2020) |>
  pivot_longer(binge_pre : sleep_pre,
               names_to = "behavior",
               values_to = "prevalence") |>
  mutate(behavior = case_match(behavior,
                               "lpa_pre" ~ "No Leisure-time Physical Activity",
                               "sleep_pre" ~ "Sleep Less Than 7 Hours",
                               "smoking_pre" ~ "Smoking Currently",
                               "insurance_pre" ~ "Lack of Health Insurance",
                               "cholscreen_pre" ~ "Cholesterol Screening",
                               "binge_pre" ~ "Binge Drinking"
                               ),
         behavior = factor(behavior, levels = behavior))
  

y_range_box = range(pull(chol_pivot, prevalence), pull(behavior_pivot, prevalence))

plot1 = chol_pivot |>
  plot_ly(y = ~prevalence, color = ~chol_year, type = "box", colors = c("#F5761A", "#b73779")) |>
  layout(
         xaxis = list(title = 'Year', tickangle=-45),
         yaxis = list(range = y_range_box + c(-0.01, 0.01))
         )
#legend = list(title = list(text = "Chol Year"
plot2 = behavior_pivot |>
  plot_ly(y = ~prevalence, color = ~behavior, type = "box", colors = "viridis") |>
  layout(title = 'High Cholesterol & Behaviors Prevalence',
         xaxis = list(tickangle=-45),
         yaxis = list(range = y_range_box + c(-0.01, 0.01)))


box_plot = subplot(plot1, plot2)

box_plot
  • Based on the boxplot, we can find that prevalence of high cholesterol was similar between 2020 and 2021. Lacking sleep was the most appreciable health-adverse behavior, followed by lacking physical activities. Binge drinking and smoking problem also existed, while they were not significant compared to other health-adverse behavior. The prevalence of cholesterol screening is high, while the prevalence of lacking health insurance is low. These indicate health-promote behavior.

Find Top Behaviors

  • We try to find the top factors that affect high cholesterol most in terms of correlation

Association Trend Plot

This is to observe trendings of associations between prevalence of high cholesterol and prevalence of related health-affecting behaviors.

x_range = range(c(pull(merged_total, chol_pre_2021), pull(merged_total, chol_pre_2020)))

y_range = range(c(pull(merged_total, binge_pre), pull(merged_total, lpa_pre), pull(merged_total, smoking_pre), pull(merged_total, insurance_pre), pull(merged_total, cholscreen_pre), pull(merged_total, sleep_pre)))

prevalence_plot = 
merged_total |>
  plot_ly() |>
  add_trace(x = ~chol_pre_2021, y = ~binge_pre, type = 'scatter', size = ~total_popu, sizes = c(50, 250), mode = 'markers', opacity = 0.75, name = 'Binge Drinking', showlegend = TRUE, visible = TRUE) |>
  add_trace(x = ~chol_pre_2021, y = ~lpa_pre, type = 'scatter', mode = 'markers', size = ~total_popu, sizes = c(50, 250), name = 'No Leisure-time Physical Activity', showlegend = TRUE, visible = TRUE) |>
  add_trace(x = ~chol_pre_2021, y = ~smoking_pre, type = 'scatter', mode = 'markers', size = ~total_popu, sizes = c(50, 250), name = 'Smoking Currently', showlegend = TRUE, visible = TRUE) |>
  add_trace(x = ~chol_pre_2021, y = ~insurance_pre, type = 'scatter', mode = 'markers', size = ~total_popu, sizes = c(50, 250), name = 'Lack of Health Insurance Currently', showlegend = TRUE, visible = TRUE) |>
  add_trace(x = ~chol_pre_2021, y = ~cholscreen_pre, type = 'scatter', mode = 'markers', size = ~total_popu, sizes = c(50, 250), name = 'Cholesterol Screening', showlegend = TRUE, visible = TRUE) |>
  add_trace(x = ~chol_pre_2021, y = ~sleep_pre, type = 'scatter', mode = 'markers', size = ~total_popu, sizes = c(50, 250), name = 'Sleep Less Than 7 Hours', showlegend = TRUE, visible = TRUE) |>
  
  layout(title = list(text = 'Prevalence of High Cholesterol v.s. Health-Affecting Behaviors', x = 0.08),
         titlefont = list(size = 15),
         xaxis = list(title = 'Prevalence of High Cholesterol',  tickformat = '.0%', showline = TRUE, range = x_range + c(-0.01, 0.01)),
         yaxis = list(title = 'Prevalence of Health-Affecting Behavior', range = y_range + c(-0.05, 0.05), tickformat = '.0%', showline = TRUE))
         

prevalence_plot
  • Based on the trend, when the prevalence of high cholesterol increased, we can find the prevalence of smoking, lacking physical activity, lacking sleep, lacking health insurance, and cholesterol screening also increased. The trend of the prevalence of binge drinking is not obvious. Therefore, we move to check sub-association trending plot.

Calculate the correlation between each behavior and high cholesterol

cor_table = data.frame(behavior = character(), correlation = numeric(), stringsAsFactors = FALSE)

behavs = colnames(merged_2021[4:8])

for (each in behavs) {
  new_row = data.frame(behavior = substr(each, 1, nchar(each) - 4), correlation = round(cor(pull(merged_2021, chol_pre_2021), pull(merged_2021, each)), digits = 4))
  
  cor_table = rbind(cor_table, new_row)
}

sleep_row = data.frame(behavior = 'sleep', correlation = round(cor(pull(merged_2020, chol_pre_2020), pull(merged_2020, sleep_pre)), digits = 4))

cor_table = rbind(cor_table, sleep_row)
cor_table = arrange(cor_table, desc(abs(correlation)))

cor_table = cor_table |>
  mutate(behavior = case_match(behavior,
                               "lpa" ~ "No Leisure-time Physical Activity",
                               "sleep" ~ "Sleep Less Than 7 Hours",
                               "smoking" ~ "Smoking Currently",
                               "insurance" ~ "Lack of Health Insurance Currently",
                               "cholscreen" ~ "Cholesterol Screening",
                               "binge" ~ "Binge Drinking"
                               ),
         behavior = factor(behavior, levels = behavior)) |>
  rename("Behavior" = behavior,
         "Correlation" = correlation) 


knitr::kable(cor_table)
Behavior Correlation
No Leisure-time Physical Activity 0.6961
Sleep Less Than 7 Hours 0.6696
Binge Drinking -0.5014
Smoking Currently 0.4932
Lack of Health Insurance Currently 0.3606
Cholesterol Screening 0.2125

Correlation Bar Chart

Use a bar chart to visualize it.

cor_bar = plot_ly(cor_table, x = ~Behavior, y = ~Correlation, type = 'bar', text = ~Correlation, texttemplate = '%{y:.2f}', textposition = 'outside', opacity = 0.8, marker = list(color = "MidnightBlue")) |>
  layout(title = 'Correlation Between High Cholesterol and Health Behaviors',
         xaxis = list(title = 'Behavior', tickangle=-30),
         yaxis = list(title = 'Correlation')) 
 
  

cor_bar
  • Based on the table results and bar chart, we can find that the health-adverse behavior lacking physical activities, lacking sleep, binge drinking, and smoking had top four high correlations with high cholesterol. Binge drinking had a negative correlation with high cholesterol, which is against the common sense.

  • Therefore, we decide to examine the associations between these four health-adverse behaviors and total cholesterol, and then to provide a statistical model to predict high total cholesterol risk for users.