# test
# coding=utf-8
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMRegressor
import re
from sklearn.decomposition import PCA
import joblib
import shap
# Load the spreadsheet, separate the target column "y" from the feature
# columns, and hold out 10% of the rows as a test split (fixed seed).
data = pd.read_excel(r"E:\Desktop\data.xlsx")
y = data["y"]
X = data.drop(columns="y")
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=0,
)
# Exhaustive hyper-parameter search for a LightGBM regressor.
# Every combination in the grid is scored with 5-fold cross-validation on the
# training split, using negated mean squared error as the score.
# NOTE(review): LightGBM only applies `subsample` when `subsample_freq` > 0,
# which is not set here - confirm whether row subsampling is intended.
param_grid = dict(
    n_estimators=[200, 500],
    max_depth=range(3, 8, 2),  # i.e. 3, 5, 7
    learning_rate=[0.1, 0.2],
    subsample=[0.8],
    colsample_bytree=[0.8],
    num_leaves=[31, 63, 127],
)
lgb = LGBMRegressor(random_state=0)
grid = GridSearchCV(
    estimator=lgb,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=5,
)
grid.fit(X_train, y_train)
# Keep the estimator fitted with the winning parameter combination.
best_lgb = grid.best_estimator_
print("best_params:", grid.best_params_)
# Evaluate the tuned model on the held-out test split.
y_pred = best_lgb.predict(X_test)
error = y_pred - y_test  # signed residuals: prediction minus truth

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
# Pearson correlation: off-diagonal entry of the 2x2 correlation matrix.
pcc = np.corrcoef(y_test, y_pred)[0, 1]

# Report the metrics in a fixed order, one "<name>: <value>" line each.
for label, value in (
    ("mse", mse),
    ("rmse", rmse),
    ("mae", mae),
    ("r2", r2),
    ("pcc", pcc),
):
    print(f"{label}:", value)
# Plot 1: predicted value against true value for every test sample.
plt.scatter(y_test, y_pred, c="blue")
plt.title("Truth vs predict")
plt.xlabel("Truth")
plt.ylabel("predict")
plt.show()

# Plot 2: histogram (20 bins) of the signed test residuals.
plt.hist(error, bins=20, color="orange")
plt.title("SE distribute")
plt.xlabel("SE")
plt.ylabel("Fruquence")
plt.show()
# Project the full feature matrix onto its first two principal components
# and colour each sample by its target value.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
first_component = X_pca[:, 0]
second_component = X_pca[:, 1]
plt.scatter(first_component, second_component, c=y, cmap="rainbow")
plt.title("PCA result")
plt.xlabel("1st_PCA")
plt.ylabel("2rd_PCA")
# The colour bar maps the rainbow colours back to target values.
plt.colorbar()
plt.show()
# Evaluate the tuned model on the training split, mirroring the test metrics.
y_train_pred = best_lgb.predict(X_train)
mae_train = mean_absolute_error(y_train, y_train_pred)
# Score against the *training* predictions: the test-set `y_pred` has a
# different length than `y_train`, so scikit-learn would raise on it.
mse_train = mean_squared_error(y_train, y_train_pred)
# Stored under its own name so the test-set `rmse` is left untouched and the
# print below refers to a defined variable.
rmse_train = np.sqrt(mse_train)
r2_train = r2_score(y_train, y_train_pred)
# Pearson correlation: off-diagonal entry of the 2x2 correlation matrix.
pcc_train = np.corrcoef(y_train, y_train_pred)[0, 1]
print("mae_train:", mae_train)
print("mse_train:", mse_train)
print("rmse_train:", rmse_train)
print("r2_train:", r2_train)
print("pcc_train:", pcc_train)
# Per-feature weight analysis.
# The fitted estimator's `feature_importances_` attribute holds one
# importance score per input column.
feature_importances = best_lgb.feature_importances_  # importance scores
feature_names = X.columns  # matching feature (column) names
# Bar chart showing the importance of each feature.
plt.bar(feature_names, feature_importances)
plt.title("Feature importance")
plt.xlabel("Feature")
plt.ylabel("Importance")
plt.show()

# SHAP analysis: build a tree explainer for the tuned model and compute the
# SHAP values of every feature over the full feature matrix.
explainer = shap.TreeExplainer(best_lgb)
shap_values = explainer.shap_values(X)
# Summary plot (bar variant) of the SHAP values per feature.
shap.summary_plot(shap_values, X, plot_type="bar")
# Persist the tuned model to disk, then load it back from the saved file.
joblib.dump(best_lgb, 'best_lgb.pkl')
# NOTE: joblib.load unpickles the file - only load artefacts you trust.
model = joblib.load('best_lgb.pkl')

# Re-read the spreadsheet and drop the target column so that only the
# feature columns are fed to the model.
df = pd.read_excel(r"E:\Desktop\data.xlsx")
df = df.drop("y", axis=1)

# Predict all rows in a single call instead of looping row by row. Passing
# the DataFrame keeps the feature names the model was trained with, and the
# target Series `y` is no longer overwritten by a scalar prediction.
# NOTE(review): the original loop wrote to an undefined `df_read` under
# label 18. Here `df_read` is a copy of the feature frame and the predictions
# are stored in a column labelled 18, which is what `.loc[i, 18] = ...` would
# have created - confirm the intended destination column.
df_read = df.copy()
df_read[18] = model.predict(df)