品牌型网站建设哪家,学做ps的软件的网站,wordpress 关闭文章修订,手机网站建设yu
Gitee仓库地址:特征筛选LASSO回归封装好的代码、数据集和结果 README
LassoFeatureSelector_main
这个是主函数文件,在实例化LassoFeatureSelector类时,需要传入下面这些参数:
input_train_data_path:输入训练集的路…
Gitee仓库地址:特征筛选LASSO回归封装好的代码、数据集和结果 README
LassoFeatureSelector_main
这个是主函数文件,在实例化LassoFeatureSelector类时,需要传入下面这些参数:
- input_train_data_path:输入训练集的路径
- input_test_data_path:输入测试集的路径
- output_train_path:输出训练集的路径
- output_test_path:输出测试集的路径
- Upper_limit_alpha:正则化搜索范围上限
- Lower_limit_alpha:正则化搜索范围下限
- iterations:LASSO回归迭代次数
- cv:选择最佳正则化系数的交叉验证次数
实例化后调用总运行函数即可:lasso_selector.run_all()
LassoFeatureSelector
这个是封装好的类,主要实现以下几个功能:
- 计算特征筛选前后的方差膨胀因子,输出并导出
- 绘制岭迹图并导出
- 以MSE为损失函数进行LASSO回归
- k折交叉验证进行最佳正则化系数的搜索
- 导出特征筛选后的训练集和测试集
- 无论输入的文件格式是xlsx文件还是csv文件,类都能读取
数据集
数据集来自网络入侵检测领域的经典数据集 NSL-KDD。
预处理好的数据集和导出的训练集、测试集可以在百度网盘下载:
链接:https://pan.baidu.com/s/125SniuPOWFkrB4fONtIPQw?pwd=fgin 提取码:fgin
原始数据集见官网下载:
ISCX NSL-KDD dataset 2009
导出的文件
- LASSO系数矩阵.xlsx
- LASSO回归岭迹图.png
- 原始训练集的方差膨胀因子.xlsx
- LASSO回归后训练集的方差膨胀因子.xlsx
- NSLKDD_train_LASSO.xlsx
- NSLKDD_test_LASSO.xlsx
LASSO回归参数说明
若需要调整LASSO回归的参数,需要到LassoFeatureSelector文件的lasso_regression函数中进行修改:
lassoreg = Lasso(alpha=alpha, max_iter=self.iterations, fit_intercept=True, precompute=False, copy_X=False, tol=0.0001, warm_start=False, positive=False, selection='cyclic')
- alpha=alpha:正则化强度,控制稀疏性
- fit_intercept=True:拟合截距
- precompute=False:是否预计算 Gram 矩阵,通常设置为 False
- copy_X=False:不对输入数据进行复制(设置为 True 时才会复制输入数据)
- max_iter=self.iterations:最大迭代次数,控制算法运行的最大迭代次数
- tol=0.0001:收敛的容忍度,指定算法收敛的阈值
- warm_start=False:如果为 True,则复用上一次调用得到的解作为初始权重
- positive=False:如果为 True,则要求系数为正
- selection='cyclic':指定系数更新的策略;cyclic 表示按循环顺序逐个更新系数
封装好的类
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from matplotlib import rcParams
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import Lasso, LassoCV
from sklearn.metrics import mean_squared_error
from statsmodels.stats.outliers_influence import variance_inflation_factor
import warnings
import os

# Show every column and row when DataFrames are printed.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

# Font used by matplotlib so that the Chinese axis labels render.
rcParams["font.sans-serif"] = ["Microsoft YaHei"]
rcParams["axes.unicode_minus"] = False

# Ignore specific warning categories.
# Silence ConvergenceWarning, which the repeated Lasso fits may emit.
warnings.filterwarnings("ignore", category=ConvergenceWarning)


class LassoFeatureSelector:
    """LASSO-based feature selection over a training/test table pair.

    Both tables are expected to hold the feature columns first and the
    target in the last column (every method slices ``iloc[:, 0:-1]`` for
    features and ``iloc[:, -1]`` for the target).

    Parameters
    ----------
    input_train_data_path, input_test_data_path
        Paths of the input training / test tables (``.xlsx`` or ``.csv``).
    output_train_path, output_test_path
        Paths the reduced training / test tables are written to
        (``.xlsx`` or ``.csv``).
    upper_limit_alpha, lower_limit_alpha
        End points of the regularisation-strength grid.
    iterations
        Used both as ``max_iter`` of the Lasso solvers and as the number of
        grid points between the two alpha limits.
    cv
        Number of cross-validation folds used by ``LassoCV``.
    """

    _SUPPORTED_EXTENSIONS = (".xlsx", ".csv")

    def __init__(self, input_train_data_path, input_test_data_path, output_train_path,
                 output_test_path, upper_limit_alpha, lower_limit_alpha, iterations, cv):
        self.input_train_data_path = input_train_data_path
        self.input_test_data_path = input_test_data_path
        self.output_train_path = output_train_path
        self.output_test_path = output_test_path
        self.upper_limit_alpha = upper_limit_alpha
        self.lower_limit_alpha = lower_limit_alpha
        self.iterations = iterations
        self.cv = cv

    @classmethod
    def _extension(cls, path):
        """Return the lower-cased extension of *path*; raise ValueError if unsupported."""
        ext = os.path.splitext(path)[1].lower()
        if ext not in cls._SUPPORTED_EXTENSIONS:
            raise ValueError(
                f"Unsupported file type {ext!r} for {path!r}; "
                f"expected one of {cls._SUPPORTED_EXTENSIONS}"
            )
        return ext

    @classmethod
    def _read_table(cls, path):
        """Read an .xlsx or .csv file into a DataFrame."""
        if cls._extension(path) == ".xlsx":
            return pd.read_excel(path)
        return pd.read_csv(path)

    @classmethod
    def _write_table(cls, frame, path):
        """Write *frame* to an .xlsx or .csv file without the index."""
        if cls._extension(path) == ".xlsx":
            frame.to_excel(path, index=False)
        else:
            frame.to_csv(path, index=False)

    def _feature_names(self):
        """Return the names of all training columns except the last (target) one."""
        return list(self.input_train_data.columns[:-1])

    def load_data(self):
        """Load the training and test tables from the configured input paths.

        Raises
        ------
        ValueError
            If either input path is neither ``.xlsx`` nor ``.csv``.
        """
        self.input_train_data = self._read_table(self.input_train_data_path)
        self.input_test_data = self._read_table(self.input_test_data_path)

    def lasso_regression(self, train, test, alpha):
        """Fit one Lasso model on *train* and score it on *test*.

        Returns a flat list ``[alpha, mse, feature_count, coef_0, coef_1, ...]``
        where ``feature_count`` is the number of non-zero coefficients.
        """
        lassoreg = Lasso(alpha=alpha, max_iter=self.iterations, fit_intercept=True,
                         precompute=False, copy_X=False, tol=0.0001, warm_start=False,
                         positive=False, selection="cyclic")
        lassoreg.fit(train.iloc[:, 0:-1], train.iloc[:, -1])
        feature_count = np.sum(lassoreg.coef_ != 0)
        y_pred = lassoreg.predict(test.iloc[:, 0:-1])
        mse = mean_squared_error(test.iloc[:, -1], y_pred)
        return [alpha, mse, feature_count, *lassoreg.coef_]

    def matrix_lasso(self):
        """Build the alpha-by-coefficient matrix and export it to Excel.

        The training table is split 80/20 (fixed seed); each alpha on the grid
        is fitted on the larger part and scored on the smaller part.
        """
        # NOTE: ``iterations`` doubles as the number of grid points here.
        self.alpha_lasso = np.linspace(self.lower_limit_alpha, self.upper_limit_alpha,
                                       self.iterations)
        col = ["alpha", "mse", "feature_count"] + self._feature_names()
        ind = ["alpha_%.4g" % alpha for alpha in self.alpha_lasso]
        fit_part, eval_part = train_test_split(self.input_train_data, test_size=0.2,
                                               random_state=42)
        # Collect all rows first so the frame gets numeric dtypes instead of
        # being filled cell by cell into an empty object-dtype frame.
        rows = [self.lasso_regression(fit_part, eval_part, alpha)
                for alpha in self.alpha_lasso]
        self.coef_matrix_lasso = pd.DataFrame(rows, index=ind, columns=col)
        self.coef_matrix_lasso.to_excel(r"LASSO系数矩阵.xlsx", index=True)

    def plot_lasso_path(self):
        """Plot every feature's coefficient against alpha, save and show the figure."""
        feature_names = self._feature_names()
        feature_total = len(feature_names)
        plt.figure(figsize=(14, 6.8))
        for i, name in enumerate(feature_names):
            plt.plot(self.coef_matrix_lasso["alpha"], self.coef_matrix_lasso[name],
                     color=plt.cm.Set1(i / feature_total), label=name)
        plt.legend(loc="upper right", ncol=2, prop={"size": 7})
        plt.xlabel("正则化系数", fontsize=14)
        plt.ylabel("回归系数", fontsize=14)
        plt.savefig(r"LASSO回归岭迹图", dpi=600)
        plt.show()

    def select_best_alpha(self):
        """Pick the best alpha on the grid with ``LassoCV`` and print it."""
        alpha_choose = np.linspace(self.lower_limit_alpha, self.upper_limit_alpha,
                                   self.iterations)
        lasso_cv = LassoCV(alphas=alpha_choose, cv=self.cv, max_iter=self.iterations)
        lasso_cv.fit(self.input_train_data.iloc[:, 0:-1], self.input_train_data.iloc[:, -1])
        self.lasso_best_alpha = lasso_cv.alpha_
        print(f"选择的最佳正则化系数: {self.lasso_best_alpha}")

    def calculate_vif(self, data):
        """Return a DataFrame with one variance inflation factor per column of *data*."""
        vif = pd.DataFrame()
        vif["特征"] = data.columns
        vif["方差膨胀因子"] = [variance_inflation_factor(data.values, i)
                            for i in range(data.shape[1])]
        return vif

    def fit_lasso_model(self):
        """Fit Lasso with the chosen alpha and keep the features with non-zero coefficients."""
        features = self.input_train_data.iloc[:, 0:-1]
        self.lasso_model = Lasso(alpha=self.lasso_best_alpha, fit_intercept=True,
                                 max_iter=self.iterations, random_state=42,
                                 selection="cyclic")
        self.lasso_model.fit(features, self.input_train_data.iloc[:, -1])
        self.selected_features = features.columns[self.lasso_model.coef_ != 0]

    def save_vif(self):
        """Print the VIF tables before and after selection and export both to Excel."""
        features = self.input_train_data.iloc[:, 0:-1]
        vif_before = self.calculate_vif(features)
        print("原始数据的方差膨胀因子:\n", vif_before)
        vif_after = self.calculate_vif(features[self.selected_features])
        print("筛选特征后的方差膨胀因子:\n", vif_after)
        vif_before.to_excel(r"原始训练集的方差膨胀因子.xlsx", index=False)
        vif_after.to_excel(r"LASSO回归后训练集的方差膨胀因子.xlsx", index=False)

    def save_selected_data(self):
        """Export the training and test tables reduced to the selected features plus target.

        Raises
        ------
        ValueError
            If either output path is neither ``.xlsx`` nor ``.csv``.
        """
        kept = list(self.selected_features)
        selected_data_train = self.input_train_data[kept + [self.input_train_data.columns[-1]]]
        selected_data_test = self.input_test_data[kept + [self.input_test_data.columns[-1]]]
        self._write_table(selected_data_train, self.output_train_path)
        self._write_table(selected_data_test, self.output_test_path)

    def calculate_avg_vif(self):
        """Print the mean VIF of the training features before and after selection."""
        features = self.input_train_data.iloc[:, 0:-1]
        avg_vif_before = self.calculate_vif(features)["方差膨胀因子"].mean()
        avg_vif_after = self.calculate_vif(features[self.selected_features])["方差膨胀因子"].mean()
        print(f"特征筛选前的平均方差膨胀因子: {avg_vif_before}")
        print(f"特征筛选后的平均方差膨胀因子: {avg_vif_after}")

    def run_all(self):
        """Run the whole pipeline: load, grid fit, plot, select alpha, fit, export."""
        # Fail before the expensive fits if the results could not be written.
        self._extension(self.output_train_path)
        self._extension(self.output_test_path)
        self.load_data()
        self.matrix_lasso()
        self.plot_lasso_path()
        self.select_best_alpha()
        self.fit_lasso_model()
        self.save_vif()
        self.save_selected_data()
        self.calculate_avg_vif()


# Caller script:
from LassoFeatureSelector import LassoFeatureSelector

# Initialization
input_train_data_path = r"D:\Gitee\NSLKDD_train.xlsx"  # path of the input training set
input_test_data_path = r"D:\Gitee\NSLKDD_test.xlsx"  # path of the input test set
output_train_path = r"D:\Gitee\NSLKDD_train_LASSO.xlsx"  # path of the output training set
output_test_path = r"D:\Gitee\NSLKDD_test_LASSO.xlsx"  # path of the output test set
# The two limits were swapped in the original (upper 0.001 < lower 0.0012).
# The searched range is unchanged; the alpha grid now runs in ascending order.
Upper_limit_alpha = 0.0012  # upper limit of the regularisation search range
Lower_limit_alpha = 0.001  # lower limit of the regularisation search range
iterations = 2000  # number of LASSO regression iterations
cv = 10  # number of cross-validation folds used to choose the best alpha

# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    lasso_selector = LassoFeatureSelector(input_train_data_path, input_test_data_path,
                                          output_train_path, output_test_path,
                                          Upper_limit_alpha, Lower_limit_alpha,
                                          iterations, cv)
    lasso_selector.run_all()