# 前言:Python 在统计分析和科研绘图方面有较多的库,可实现包括但不限于卡方检验、T 检验、相关性分析、热力图、生存分析图等功能,且可应用于机器学习。临床医学科研在这方面需求较大,无论是论文还是课题;提供源代码的意义在于便于调参。如需科研数据处理,可私信联系。
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Use Times New Roman for all figure text.
# NOTE(review): Times New Roman has no CJK glyphs, so Chinese column names
# used as axis labels may render as empty boxes — confirm the intended font.
plt.rc('font', family='Times New Roman')

# `a` is the workbook path and `b` the DataFrame read from its 'Sheet1'
# worksheet; `b` is the data used by the correlation plots further down.
a = 'C:/Users/46685/Desktop/科研数据/数据汇总/新建处理后后连续变量.xlsx'
b = pd.read_excel(a, sheet_name='Sheet1')

# Notebook-style previews: these bare expressions only display something in
# an interactive session.
b.head()
b.describe()  # b is the data

# Population variance (ddof=0) of every numeric column. Computed with pandas
# directly instead of squaring np.std(b), which depends on NumPy implicitly
# dispatching to DataFrame.std and fails when a non-numeric column is present.
b.var(ddof=0, numeric_only=True)
# NOTE(review): `filePath_01` was never assigned anywhere in this script, so
# the original line raised NameError. It is assumed to be the same workbook
# whose path is stored in `a` — confirm against the real data source.
filePath_01 = a

# From here on `a` is rebound from the path string to the loaded DataFrame.
a = pd.read_excel(filePath_01, sheet_name='Sheet1')

# Keep only the two variables of interest in a small working frame.
ls1 = a["CA724"]
ls2 = a["年龄"]
data = pd.DataFrame({'CA724': ls1, '年龄': ls2})

# First draw the pairwise relationship plot: scatter plots off the diagonal,
# kernel density estimates on the diagonal.
sns.pairplot(data, kind='scatter', diag_kind='kde')
# Kolmogorov-Smirnov normality check for every column of `data`.
# The original called `scipy.stats.kstest` although `scipy` itself was never
# imported; the import is done here so this section runs on its own.
from scipy import stats

# NOTE(review): the normal parameters are estimated from the same sample that
# is being tested, which makes the KS p-value optimistic; a Lilliefors or
# Shapiro-Wilk test may be more appropriate — confirm with the analyst.
for column in data.columns:
    # Drop missing values explicitly: pandas mean/std already skip NaN, but
    # kstest does not, and a single NaN would invalidate the p-value.
    values = data[column].dropna()
    u = values.mean()  # sample mean
    std = values.std()  # sample standard deviation
    r, p = stats.kstest(values, 'norm', (u, std))
    # The null hypothesis is "the data follow the fitted normal distribution",
    # so a large p-value means it cannot be rejected. The original messages
    # had the accept/reject wording inverted and called p the significance
    # level.
    if p > 0.05:
        print('不能拒绝原假设,p值为{},变量{}服从正态分布'.format(p, column))
    else:
        print('拒绝原假设,p值为{},变量{}不服从正态分布'.format(p, column))
from scipy import stats
# Pearson correlation between two columns of `b` (notebook-style expression:
# the result is only displayed in an interactive session).
stats.pearsonr(b.胆总管扩张, b.肿块最大直径)

# Plain scatter plot of two columns.
sns.scatterplot(x=b["胆总管扩张"], y=b["肿块硬度"])

# Joint distribution of BMI and Waist in three renderings: regression fit,
# hexagonal bins, and kernel density estimate.
sns.jointplot(x='BMI', y='Waist', data=b, kind='reg', height=5, color='green')
sns.jointplot(x='BMI', y='Waist', data=b, kind='hex', height=5)
sns.jointplot(x='BMI', y='Waist', data=b, kind='kde', height=5)

# pairplot is a figure-level function that creates its own figure, so the
# original plt.figure(figsize=(10,8), dpi=80) call only opened an extra empty
# window. The requested size and dpi are applied to the pair plot's own
# figure instead.
pair_grid = sns.pairplot(
    b,
    kind="scatter",
    plot_kws=dict(s=80, edgecolor="white", linewidth=2.5),
)
pair_grid.figure.set_size_inches(10, 8)
pair_grid.figure.set_dpi(80)
plt.show()
# Correlation matrix of the numeric columns.
# NOTE(review): the original read `df.corr()`, but no `df` is defined in this
# script (NameError). The DataFrame `b` loaded earlier is assumed to be the
# intended data — confirm.
df_coor = b.select_dtypes(include='number').corr()
df_coor.head()  # notebook-style preview

# Canvas size, resolution and white background for the heatmap. The axes are
# passed to seaborn explicitly so the heatmap is drawn on this figure.
heatmap_figure, heatmap_axes = plt.subplots(figsize=(14, 14), dpi=100, facecolor='w')

# annot=True writes the coefficient into every cell, fmt='.3f' keeps three
# decimals, square=True forces square cells, vmax=1 caps the colour scale.
# `fig` keeps its original name although seaborn returns an Axes object here.
fig = sns.heatmap(
    df_coor,
    annot=True,
    vmax=1,
    square=True,
    cmap="Blues",
    fmt='.3f',
    ax=heatmap_axes,
)

# Save the picture: bbox_inches='tight' keeps the whole plot inside the image
# and transparent=True gives it a transparent background.
heatmap_figure.savefig('df_corr.png', bbox_inches='tight', transparent=True)
import pandas as pd
from lifelines import KaplanMeierFitter
import matplotlib.pyplot as plt
import numpy as np
# Path of the follow-up workbook; its '超声' worksheet is loaded into `dataset`.
a = 'C:/Users/46685/Desktop/张/随访 - 修改后---汇总-提炼改后--编码1.xls'
dataset = pd.read_excel(a, sheet_name='超声')
dataset.head()  # show the first few rows (interactive sessions only)

# Build the example data, assuming two groups. Columns are picked by position.
# NOTE(review): both groups read their event flag from column 22 and differ
# only in the time column (21 vs 18) — confirm this is the intended layout.
data_group1 = dict(time=dataset.iloc[:, 21], event=dataset.iloc[:, 22])
df_group1 = pd.DataFrame(data_group1)

data_group2 = dict(time=dataset.iloc[:, 18], event=dataset.iloc[:, 22])
df_group2 = pd.DataFrame(data_group2)

# Tag every row with its group label, then stack the two frames.
labelled_frames = [
    df_group1.assign(group='Group 1'),
    df_group2.assign(group='Group 2'),
]
df_combined = pd.concat(labelled_frames)
# Fit one Kaplan-Meier estimator per group.
kmf1, kmf2 = KaplanMeierFitter(), KaplanMeierFitter()

# Boolean row masks selecting each group inside the combined frame.
in_group1 = df_combined['group'] == 'Group 1'
in_group2 = df_combined['group'] == 'Group 2'

kmf1.fit(df_combined.loc[in_group1, 'time'], df_combined.loc[in_group1, 'event'])
kmf2.fit(df_combined.loc[in_group2, 'time'], df_combined.loc[in_group2, 'event'])

# Convert each fitted survival function into plain numpy arrays:
# the index becomes the x values, the 'KM_estimate' column the y values.
survival1 = kmf1.survival_function_
kmf1_estimate_np = np.array(survival1['KM_estimate'])
kmf1_index_np = np.array(survival1.index)

survival2 = kmf2.survival_function_
kmf2_estimate_np = np.array(survival2['KM_estimate'])
kmf2_index_np = np.array(survival2.index)
# Draw both survival curves with distinct colours and labels.
# A Kaplan-Meier estimate is a step function that stays constant between
# event times, so it is drawn with a post-step line; plt.plot would join the
# points with sloped segments and misrepresent the estimate.
plt.step(kmf1_index_np, kmf1_estimate_np, where='post', label='Group 1', color='blue')
plt.step(kmf2_index_np, kmf2_estimate_np, where='post', label='Group 2', color='red')

# Chart title and axis labels.
plt.title('Kaplan-Meier Survival Curves by Group')
plt.xlabel('Time')
plt.ylabel('Survival Probability')

# Add the legend.
plt.legend()

# Display the chart.
plt.show()
from feature_selector import FeatureSelector
# Feature screening with FeatureSelector on the feature table `x` and the
# labels `y` (both are expected to exist already at this point).
fs = FeatureSelector(data=x, labels=y)

# Flag collinear features using a correlation threshold of 0.8.
fs.identify_collinear(correlation_threshold=0.8, one_hot=False)
correlated_features = fs.ops['collinear']

# Options for the zero-importance search.
zero_importance_options = dict(
    task='classification',
    eval_metric='auc',
    # n_iterations: number of training runs; the final feature importance is
    # the mean over those runs.
    n_iterations=100,
    # early_stopping: True/False, whether training should stop early.
    early_stopping=False,
)
fs.identify_zero_importance(**zero_importance_options)

# List of zero-importance features.
zero_importance_features = fs.ops['zero_importance']

lw = 2  # not referenced in the visible part of the script

# Open a figure, set the font, then plot and show the feature importances.
plt.figure(figsize=(5, 5), dpi=1000)
plt.rc('font', family='Times New Roman')
fs.plot_feature_importances(threshold=0.9, plot_n=14)
plt.show()
plt.close()
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree

# Hold out 15% of the samples for testing; the fixed seed keeps the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.15, random_state=0
)

# Entropy-based decision tree limited to depth 5. random_state matches the
# split seed so that tie-breaking between equally good splits is also
# reproducible from run to run.
dtree = DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=0)
dtree.fit(x_train, y_train)
y_predict = dtree.predict(x_test)

# Print the per-class precision / recall / F1 report.
print(classification_report(y_test, y_predict))

# Draw the fitted tree. feature_names is passed as a plain list because some
# scikit-learn releases validate this argument strictly and reject a pandas
# Index.
# NOTE(review): class_names assumes binary labels ordered 0, 1 — confirm.
fig = plt.figure(figsize=(16, 12))
a = plot_tree(
    dtree,
    feature_names=list(x.columns),
    fontsize=12,
    filled=True,
    class_names=['0', '1'],
)