Preface: Python offers a rich set of libraries for statistics and plotting, is highly extensible, and is currently the dominant language for machine learning. Working at the source-code level matters mainly for parameter tuning: figure size, resolution, and colour in scientific plots, as well as the fitting behaviour of machine-learning models, all need to be tuned, and that tuning is done directly in the code.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
plt.rc('font', family='Times New Roman')  # use Times New Roman for all figure text
filePath_01 = 'C:/Users/46685/Desktop/科研数据/数据汇总/新建处理后后连续变量.xlsx'
b = pd.read_excel(filePath_01, sheet_name='Sheet1')
b.head()
b.describe()  # b holds the data; summary statistics for each column
np.std(b)**2  # per-column variance (np.std defaults to ddof=0, i.e. the population variance)
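NumPy and pandas default to different degrees of freedom, so it is worth being explicit about which variance is wanted:
b.var()        # pandas default: sample variance (ddof=1)
b.var(ddof=0)  # population variance, matching np.std(b)**2 above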
a = pd.read_excel(filePath_01, sheet_name='Sheet1')
ls1 = a["CA724"]
ls2 = a["年龄"]
data = pd.DataFrame({'CA724':ls1,'年龄':ls2 })
# First, plot the pairwise relationships among the variables
sns.pairplot(data,kind='scatter',diag_kind='kde')
from scipy import stats
for column in data.columns:
    u = data[column].mean()    # mean
    std = data[column].std()   # standard deviation
    r, p = stats.kstest(data[column], 'norm', (u, std))
    if p > 0.05:
        print('Fail to reject the null hypothesis (p = {}): variable {} is consistent with a normal distribution'.format(p, column))
    else:
        print('Reject the null hypothesis (p = {}): variable {} does not follow a normal distribution'.format(p, column))
stats.pearsonr(b.胆总管扩张, b.肿块最大直径)  # Pearson correlation; returns (r, p-value)
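To make the result easier to read, the correlation coefficient and p-value can be unpacked and printed explicitly:
r, p = stats.pearsonr(b.胆总管扩张, b.肿块最大直径)
print('r = {:.3f}, p = {:.4f}'.format(r, p))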
sns.scatterplot(x=b["胆总管扩张"], y=b["肿块硬度"])
sns.jointplot(x='BMI', y='Waist', data=b, kind='reg', height=5, color='green')  # scatter plot with a fitted regression line
sns.jointplot(x='BMI', y='Waist', data=b, kind='hex', height=5)                 # hexagonal-bin plot
sns.jointplot(x='BMI', y='Waist', data=b, kind='kde', height=5)                 # 2-D kernel density plot
plt.figure(figsize=(10,8), dpi= 80)
sns.pairplot(b, kind="scatter", plot_kws=dict(s=80, edgecolor="white", linewidth=2.5))
plt.show()
df_coor = b.corr()  # pairwise correlation matrix of the numeric columns
df_coor.head()
plt.subplots(figsize=(14,14), dpi=100, facecolor='w')  # canvas size, resolution, and background colour
fig = sns.heatmap(df_coor, annot=True, vmax=1, square=True, cmap="Blues", fmt='.3f')
# annot=True prints the values on the heatmap; fmt='.3f' keeps three decimal places;
# square=True forces square cells; vmax=1 fixes the upper end of the colour scale
fig.get_figure().savefig('df_corr.png', bbox_inches='tight', transparent=True)  # save the figure
# bbox_inches='tight' keeps the whole figure in frame; transparent=True makes the background transparent
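The feature-selection step below expects a feature matrix x and a label vector y, which are not defined earlier; a minimal sketch, assuming the outcome is stored in a column named '标签' (a hypothetical name; replace it with the real label column):
y = b['标签']                 # '标签' is a placeholder label column name
x = b.drop(columns=['标签'])  # all remaining columns are used as features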
from feature_selector import FeatureSelector
fs = FeatureSelector(data=x, labels=y)
fs.identify_collinear(correlation_threshold=0.8, one_hot=False)
correlated_features = fs.ops['collinear']
fs.identify_zero_importance(task='classification',
                            eval_metric='auc',
                            n_iterations=100,      # number of training runs; the final importances are averaged over them
                            early_stopping=False)  # whether to stop training early
# list of zero importance features
zero_importance_features = fs.ops['zero_importance']
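The flagged features can then be dropped; a minimal sketch using the feature-selector package's remove() method (verify the exact API against the installed version):
x_removed = fs.remove(methods=['collinear', 'zero_importance'])  # data with the flagged features removed
print('remaining features:', x_removed.shape[1])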
lw = 2
plt.figure(figsize=(5,5), dpi=1000)
plt.rc('font', family='Times New Roman')
fs.plot_feature_importances(threshold=0.9, plot_n=14)  # plot the top 14 features; 0.9 marks the cumulative-importance threshold
plt.show()
plt.close()
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.15, random_state=0)  # 85% training / 15% test
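If the outcome classes are imbalanced, a stratified split keeps the class ratio the same in both sets; the same call with the optional stratify argument:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.15, random_state=0, stratify=y)  # optional stratified variant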
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier(criterion='entropy', max_depth=5)  # information-gain splits, tree depth capped at 5
dtree.fit(x_train, y_train)
y_predict = dtree.predict(x_test)
from sklearn.tree import plot_tree
from sklearn.metrics import classification_report
print(classification_report(y_test, y_predict))  # precision, recall and F1 for each class
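A confusion matrix is a useful companion to the classification report; a small addition using scikit-learn:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_predict))  # rows = true class, columns = predicted class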
fig = plt.figure(figsize=(16,12))
plot_tree(dtree, feature_names=x.columns, fontsize=12, filled=True,
          class_names=['0', '1'])
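To export the tree for a manuscript, the figure can be saved and then displayed ('decision_tree.png' is just an example file name):
fig.savefig('decision_tree.png', dpi=300, bbox_inches='tight')  # example output file name
plt.show()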