charls数据库统计分析为什么要控制协变量
那么,为什么要控制协变量呢?因为如果不加以控制,协变量可能会带来混杂偏倚、增加模型误差等问题。
混杂偏倚是指一个或多个隐藏的因素(协变量)同时影响自变量和因变量,导致观察到的关系产生偏倚。例如,在分析吸烟与健康的关系时,年龄可能既与吸烟行为相关,也与健康结果相关。因此,如果不控制年龄,吸烟和健康之间的关系可能会被夸大或掩盖。
控制协变量可以减少这些混杂因素的影响,从而使得自变量与因变量之间的关系更加准确。例如,控制年龄、性别、收入水平等因素后,可以更加清晰地评估吸烟对健康的独立影响。
另外,CHARLS是一个追踪调查数据库,每一个新的wave都可能增加新的受访者,之前的受访者也可能在某个wave中缺席,存在诸多不确定因素。因此,按照NHANES的逻辑来处理CHARLS数据会比较困难,而且并不正确。
常见的协变量及提取方法
人口社会学变量:年龄、性别、教育水平、城乡居住、婚姻状况等
健康行为:吸烟、饮酒等
健康状况:慢性病、BMI等
上述变量均可直接从harmonised数据中获取【CHARLS官方发布的harmonised数据,对wave 1 2 3 4进行了full join,并且对一些常规变量进行处理,极其方便使用】
# Harmonised CHARLS variables to request. Every ID is the harmonised variable
# code followed by the name of the data file it comes from, so the shared
# suffix is appended once instead of being repeated on every entry.
design <- paste0(
  c(
    # Demographic and social variables
    "r1agey",     # age, 2011
    "ragender",   # gender
    "raeduc_c",   # education
    "r1mstath",   # marital status, 2011
    "r1rural2",   # place of residence, 2011
    # Health behaviours
    "r1smokev",   # ever smoked, 2011
    "r1smoken",   # smokes now, 2011
    "r1drinkn_c", # drinking, 2011
    # Health status
    "r1mbmi",     # BMI, 2011
    # The 14 chronic conditions, 2011
    "r1hibpe",    # high blood pressure
    "r1diabe",    # diabetes
    "r1cancre",   # cancer
    "r1lunge",    # lung disease
    "r1hearte",   # heart disease
    "r1stroke",   # stroke
    "r1psyche",   # psychiatric problems
    "r1arthre",   # arthritis
    "r1dyslipe",  # dyslipidemia
    "r1livere",   # liver disease
    "r1kidneye",  # kidney disease
    "r1digeste",  # digestive disease
    "r1asthmae",  # asthma
    "r1memrye"    # memory problems
  ),
  "_H_CHARLS_D_Data"
)
# Look up the descriptions for the requested variables.
# NOTE(review): presumably these become the descriptive column names used
# downstream (e.g. `r1agey_w1_r_age_in_years`) -- confirm against the helper.
column_names <- get_descriptions(design)

# Fetch the raw data for the requested variables.
df <- fetch_CHARLS_data(
  design,
  merge_method = "left",
  column_names
)
# Process the data: derive analysis-ready covariates from the raw columns.

# Count, for every row of `data`, how many harmonised items equal `yes_label`.
#
# data:      data frame holding the raw harmonised columns.
# var_codes: harmonised variable codes (e.g. "r1hibpe"). Each code must match
#            exactly one column whose name starts with "<code>_"; columns are
#            located by code so the result does not depend on the descriptive
#            label that follows the code in the column name.
# yes_label: value that marks a positive answer.
#
# Returns a numeric vector with one count per row. Missing answers are ignored
# when counting, but a row whose items are ALL missing yields NA rather than 0,
# so respondents without any information are not reported as disease-free.
# Stops with an error if a code matches zero or several columns.
count_yes_items <- function(data, var_codes, yes_label = "1.Yes") {
  item_cols <- unname(vapply(var_codes, function(code) {
    hits <- names(data)[startsWith(names(data), paste0(code, "_"))]
    if (length(hits) != 1L) {
      stop(
        "Expected exactly one column starting with '", code, "_', found ",
        length(hits), ".",
        call. = FALSE
      )
    }
    hits
  }, character(1)))

  is_yes <- data[item_cols] == yes_label
  n_yes <- rowSums(is_yes, na.rm = TRUE)
  n_yes[rowSums(!is.na(is_yes)) == 0] <- NA
  n_yes
}

step1 <- df %>%
  mutate(
    # Any value other than the two known labels (including NA) is kept as is;
    # as.character() keeps the result a character vector even if the raw
    # column is a factor.
    sex = case_when(
      ragender_r_gender == "2.woman" ~ "Female",
      ragender_r_gender == "1.man" ~ "Male",
      TRUE ~ as.character(ragender_r_gender)
    ),
    Marital = case_when(
      r1mstath_w1_r_marital_status %in%
        c("1.married", "2.married, sp abs") ~ "Married",
      r1mstath_w1_r_marital_status %in%
        c("7.widowed", "5.divorced", "4.separated", "8.never married") ~
        "Non-married",
      TRUE ~ NA_character_
    ),
    Education = case_when(
      raeduc_c_r_education_ %in% c(
        "1.No formal education illiterate",
        "2.Did not finish primary school but capable of reading and/or writing",
        "3.Sishu",
        "4.Elementary school"
      ) ~ "Primary school or below",
      raeduc_c_r_education_ %in%
        c("5.Middle school", "6.High school") ~ "High school",
      raeduc_c_r_education_ %in% c(
        "7.Vocational school",
        "8.Two/Three Year College/Associate degree",
        "9.Four Year College/Bachelor's degree",
        "10.Post-graduated(Master/PhD)"
      ) ~ "College or above",
      TRUE ~ NA_character_
    ),
    # NOTE(review): the source column is labelled "rural hukou" (registration
    # type), while the derived labels read like place of residence -- confirm
    # this is the intended variable.
    Location = case_when(
      r1rural2_w1_r_rural_hukou == "1.Rural hukou" ~ "Village",
      r1rural2_w1_r_rural_hukou == "0.Urban hukou" ~ "City/town",
      TRUE ~ NA_character_ # keep NA
    ),
    Smoking = case_when(
      # never smoked
      r1smokev_w1_r_smoke_ever == "0.No" ~ "Non-smoker",
      # smokes now
      r1smokev_w1_r_smoke_ever == "1.Yes" &
        r1smoken_w1_r_smoke_now == "1.Yes" ~ "Current smoker",
      # smoked in the past but not now
      r1smokev_w1_r_smoke_ever == "1.Yes" &
        r1smoken_w1_r_smoke_now == "0.No" ~ "Ex-smoker",
      TRUE ~ NA_character_ # keep NA
    ),
    Drinking = case_when(
      r1drinkn_c_w1_r_frequency_of_drinking_last_year == "0.None" ~
        "None of these",
      r1drinkn_c_w1_r_frequency_of_drinking_last_year ==
        "1.Less than once a month" ~ "Drink but less than once a month",
      r1drinkn_c_w1_r_frequency_of_drinking_last_year %in% c(
        "2.Once a month", "3.2 to 3 days a month", "4.Once a week",
        "5.2 to 3 days a week", "6.4 to 6 days a week", "7.Daily",
        "8.Twice a day", "9.More than twice a day"
      ) ~ "Drink more than once a month",
      TRUE ~ NA_character_ # keep NA
    ),
    BMI = r1mbmi_w1_r_measured_body_mass_index_, # measured BMI, 2011
    Age = r1agey_w1_r_age_in_years,
    # Number of the 14 chronic conditions reported as "1.Yes". `.` is the data
    # frame piped into mutate(), which holds the raw condition columns.
    chronic = count_yes_items(
      .,
      c(
        "r1hibpe", "r1diabe", "r1cancre", "r1lunge", "r1hearte",
        "r1stroke", "r1psyche", "r1arthre", "r1dyslipe", "r1livere",
        "r1kidneye", "r1digeste", "r1asthmae", "r1memrye"
      )
    ),
    # Three-level grouping of the chronic-condition count.
    Number_of_chronic_conditions = case_when(
      chronic == 0 ~ "0",
      chronic == 1 ~ "1",
      chronic >= 2 ~ "≥2",
      TRUE ~ NA_character_
    )
  )
通过这种方式,一方面可以提升数据分析速度,另一方面可以发现NA数据量大幅下降,提高了分析结果的可信度。
另外,harmonised数据也便于与HRS等数据库进行跨国比较。
多数据库联合+环境联合
5. 一键生成Kaplan-Meier曲线、森林图-小白学习周期进一步缩短!
增加了codebook查询功能,目前支持CHARLS, CHNS, CLHLS, ELSA, MHAS, SHARE数据库。 增加了数据预览功能,根据code预览对应原始数据(速度极快),便于大家理解数据的构成与内容,支持目前的7个数据库。
深夜答疑,24h并肩作战
charlsMAX R包如何购买?
购买多国健康数据库精析与挖掘课程即送charlsMAX R包,并配有常规学习代码、文章实操复现等视频学习内容(永久观看)
另有服务:社群答疑(永久)、1v1选题 审稿 投稿推荐 返修指导服务(2年)
价格:3999,学生3799(包含税,可开发票)
福利来咯!
转发此条至朋友圈+配文字“医豌豆,科研精,公共数据库助你行”(维持3小时以上),即可免费获取“配有charlsMAX相关code、变量查询的网站”,并可加入公共数据库交流社群一起学习。
课程咨询微信
豌豆老师
小宇老师
课程购买链接
行稳致远 进而有为
期待你的
分享
点赞
在看