Preprocessing
Missing value handling
Data standardization
Dummy variables
scikit-learn requires features to be numeric, with no missing values.
Handling categorical data, two options:
scikit-learn: OneHotEncoder (a sketch follows the get_dummies example below)
pandas: get_dummies
import pandas as pd
music_df = pd.read_csv('music.csv')
music_dummies = pd.get_dummies(music_df["genre"], drop_first=True)  # drop_first=True drops one dummy column to avoid redundancy
print(music_dummies.head())
music_dummies = pd.concat([music_df, music_dummies], axis=1)  # join the dummies back onto the original data
music_dummies = music_dummies.drop("genre", axis=1)  # the original genre column is no longer needed
# Equivalent shortcut: pass the whole DataFrame and let pandas encode every non-numeric column
music_dummies = pd.get_dummies(music_df, drop_first=True)
print(music_dummies.columns)
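For comparison, here is the scikit-learn route mentioned above. A minimal sketch with OneHotEncoder, assuming the same music.csv file and scikit-learn >= 1.2 (the sparse_output argument; older versions use sparse=False instead):
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
music_df = pd.read_csv('music.csv')
ohe = OneHotEncoder(drop='first', sparse_output=False)  # drop='first' mirrors drop_first=True in get_dummies
genre_encoded = ohe.fit_transform(music_df[["genre"]])   # expects a 2D input, hence the double brackets
print(ohe.get_feature_names_out(["genre"]))
print(genre_encoded[:5])
Unlike get_dummies, a fitted OneHotEncoder can be reused inside a Pipeline to transform new data consistently.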
Linear regression with dummy variables
import numpy as np
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.linear_model import LinearRegression
X = music_dummies.drop("popularity", axis=1).values
y = music_dummies["popularity"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
linreg = LinearRegression()
linreg_cv = cross_val_score(linreg, X_train, y_train, cv=kf,
                            scoring="neg_mean_squared_error")
print(np.sqrt(-linreg_cv))  # convert negative MSE back to RMSE, one value per fold
A runnable, self-contained example:
import pandas as pd
# Create an example DataFrame
data = {
'Color': ['Red', 'Blue', 'Green', 'Blue', 'Red', 'Green', 'Yellow', 'Blue', 'Red'],
'Shape': ['Circle', 'Square', 'Triangle', 'Circle', 'Square', 'Circle', 'Triangle', 'Square', 'Triangle']
}
df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)
# Use get_dummies() to convert the 'Color' column into dummy variables
df_dummies = pd.get_dummies(df, columns=['Color'])
print("\nDataFrame with dummy variables:")
print(df_dummies)
Missing value handling
print(music_df.isna().sum().sort_values())
Dropping missing values: dropna
Imputing missing values: SimpleImputer
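The original notes don't show dropna in action; a minimal sketch, assuming we drop the rows whose genre or popularity is missing (a hypothetical choice of key columns):
music_df = pd.read_csv('music.csv')
print(music_df.shape)
music_df = music_df.dropna(subset=["genre", "popularity"])  # keep only rows complete in these two (assumed) columns
print(music_df.shape)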
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
X_cat = music_df["genre"].values.reshape(-1, 1)
X_num = music_df.drop(["genre", "popularity"], axis=1).values
y = music_df["popularity"].values
# Splitting twice with the same random_state keeps the categorical and numeric rows aligned
X_train_cat, X_test_cat, y_train, y_test = train_test_split(X_cat, y, test_size=0.2,
                                                            random_state=12)
X_train_num, X_test_num, y_train, y_test = train_test_split(X_num, y, test_size=0.2,
                                                            random_state=12)
imp_cat = SimpleImputer(strategy="most_frequent")
X_train_cat = imp_cat.fit_transform(X_train_cat)
X_test_cat = imp_cat.transform(X_test_cat)
imp_num = SimpleImputer()  # numeric data is imputed with the mean by default
X_train_num = imp_num.fit_transform(X_train_num)
X_test_num = imp_num.transform(X_test_num)
X_train = np.append(X_train_num, X_train_cat, axis=1)
X_test = np.append(X_test_num, X_test_cat, axis=1)
# Note: the imputed genre column is still categorical and would need encoding before modeling
Completing imputation and modeling with a pipeline
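No code accompanied this heading. A minimal sketch, assuming the numeric feature columns and the popularity target from music.csv used above; chaining SimpleImputer and LinearRegression in a Pipeline ensures the imputer is fit on the training data only:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
music_df = pd.read_csv('music.csv')
music_df = music_df.dropna(subset=["popularity"])  # the target itself must not contain NaN
X = music_df.drop(["genre", "popularity"], axis=1).values  # numeric features only (assumed)
y = music_df["popularity"].values
steps = [("imputer", SimpleImputer()),        # mean imputation by default
         ("linreg", LinearRegression())]
pipeline = Pipeline(steps)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
pipeline.fit(X_train, y_train)  # imputer statistics come from the training split only
print(pipeline.score(X_test, y_test))  # R^2 on the test set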
Data standardization
print(music_df[["duration_ms", "loudness", "speechiness"]].describe())
Why standardize?
Many models rely on the distance between data points. If features are on very different scales, the larger-scale features influence the model disproportionately; KNN, for instance, uses distance explicitly when predicting.
How to scale?
1. For each column, subtract the mean and divide by the standard deviation, giving data with mean 0 and variance 1. This is standardization.
2. For each column, subtract the minimum and divide by the range (max minus min), giving data in the 0-1 interval. This is min-max normalization; see the MinMaxScaler sketch after the code below.
3. The data can also be centered so that it spans the -1 to 1 range.
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
X = music_df.drop("genre", axis=1).values
y = music_df["genre"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print(np.mean(X), np.std(X))  # before scaling
print(np.mean(X_train_scaled), np.std(X_train_scaled))  # after scaling: close to 0 and 1
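Method 2 above, min-max normalization, has a ready-made scaler as well; a minimal sketch reusing the X_train/X_test split from the code above:
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()  # default feature_range=(0, 1): subtract the column minimum, divide by the range
X_train_norm = mms.fit_transform(X_train)
X_test_norm = mms.transform(X_test)
print(X_train_norm.min(), X_train_norm.max())  # training data now spans 0 to 1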
Completing scaling and modeling with a pipeline
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
steps = [('scaler', StandardScaler()),
         ('knn', KNeighborsClassifier(n_neighbors=6))]
pipeline = Pipeline(steps)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=21)
knn_scaled = pipeline.fit(X_train, y_train)
y_pred = knn_scaled.predict(X_test)
print(knn_scaled.score(X_test, y_test))
## 0.81
# For comparison, the same model without scaling:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=21)
knn_unscaled = KNeighborsClassifier(n_neighbors=6).fit(X_train, y_train)
print(knn_unscaled.score(X_test, y_test))
## 0.53
from sklearn.model_selection import GridSearchCV
steps = [('scaler', StandardScaler()),
('knn', KNeighborsClassifier())]
pipeline = Pipeline(steps)
parameters = {"knn__n_neighbors": np.arange(1, 50)}
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
random_state=21)
cv = GridSearchCV(pipeline, param_grid=parameters)
cv.fit(X_train, y_train)
y_pred = cv.predict(X_test)
# Inspect the best cross-validation score and parameters
print(cv.best_score_)
print(cv.best_params_)
from sklearn.linear_model import LogisticRegression
# Build the steps
steps = [("scaler", StandardScaler()),
         ("logreg", LogisticRegression())]
pipeline = Pipeline(steps)
# Create the parameter space
parameters = {"logreg__C": np.linspace(0.001, 1.0, 20)}
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=21)
# Instantiate the grid search object
cv = GridSearchCV(pipeline, param_grid=parameters)
# Fit to the training data
cv.fit(X_train, y_train)
print(cv.best_score_, "\n", cv.best_params_)