KDDCup深度学习

发布时间: 2023-05-16 23:47:09  作者: lisyr
import pandas as pd
import torch
import torchvision
import torch.nn as nn
import numpy as np
import torch.utils.data as Data
from sklearn import preprocessing
import matplotlib.pyplot as plt

# Training hyperparameters.
#   epochs     - number of train/evaluate rounds executed by main()
#   batch_size - mini-batch size of the training DataLoader
#   lr         - learning rate handed to the Adam optimizer
epochs, batch_size, lr = 20, 64, 1e-3

# Per the original author's note, the files from the official site were
# converted to CSV beforehand; they carry no header row.
train_data = pd.read_csv('./data/train_10_percent.csv', header=None)
test_data = pd.read_csv('./data/test.csv', header=None)

# Column 41 holds the class label. The test file contains label values that
# never occur in the training file (17 of them, per the original note), so
# keep only the test rows whose label is also present in training.
known_labels = set(train_data[41])
keep_mask = test_data[41].isin(known_labels)
test_data = test_data[keep_mask]

# Stack training rows first, then the filtered test rows, with a fresh 0..N-1 index.
data = pd.concat([train_data, test_data], ignore_index=True)

# Encode the categorical features and the label, then drop column 19.
le = preprocessing.LabelEncoder()

# Columns 1, 2 and 3 are label-encoded to integer codes. The single encoder is
# re-fitted for each column, so afterwards `le` is fitted on column 3.
for categorical_col in (1, 2, 3):
    data[categorical_col] = le.fit_transform(data[categorical_col])

# Binary target: 1 for 'normal.' records, 0 for every other label.
data[41] = (data[41] == 'normal.').astype('int64')

# Per the original author's note, column 19 is all zeros and carries no
# information, so it is removed. The remaining 41 columns are renumbered 0..40
# (features 0..39, label 40).
data.drop(columns=[19], inplace=True)
data.columns = [*range(41)]

# Min-max scale each of the 40 feature columns (0..39) to [0, 1] and store
# them as float32.
# NOTE(review): the minimum/maximum are taken over `data`, which holds the
# training AND test rows, so test-set statistics leak into the scaling. This
# matches the original behaviour; consider fitting on the training rows only.
for i in range(40):
    column = data[i]
    # Vectorised Series.min()/max() instead of the Python built-ins, which
    # would iterate over every element at interpreter speed.
    low, high = column.min(), column.max()
    span = high - low
    if span == 0:
        # A constant column would otherwise yield 0/0 = NaN for every row and
        # poison the network input; map it to all zeros instead.
        data[i] = np.zeros(len(data), dtype='float32')
    else:
        # Replace the whole column (rather than `data.loc[:, i] = ...`) so an
        # integer column can change dtype without an in-place upcast.
        data[i] = ((column - low) / span).astype('float32')

# Build the PyTorch tensors, datasets and data loaders.
#
# `data` holds the training rows first, followed by the test rows, so the
# split point is the number of training rows. It is read from `train_data`
# (still the training DataFrame at this point) instead of being hard-coded.
# The split is positional and half-open: label-based `.loc[:k]` slicing
# includes row k, which previously put the first test row into the training
# set as well.
n_train = len(train_data)

# Columns 0..39 are the features, column 40 is the 0/1 label.
feature_matrix = torch.tensor(data.iloc[:, :40].to_numpy(dtype='float32'))
label_vector = torch.tensor(data.iloc[:, 40].to_numpy(dtype='int64'))

train_data, train_label = feature_matrix[:n_train], label_vector[:n_train]
test_data, test_label = feature_matrix[n_train:], label_vector[n_train:]

train_dataset = Data.TensorDataset(train_data, train_label)
test_dataset = Data.TensorDataset(test_data, test_label)

# Iterable mini-batch loaders; only the training set is shuffled.
train_loader = Data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = Data.DataLoader(test_dataset, batch_size=128)

# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Model: a three-layer fully connected network with ReLU activations.
# The labels were reduced to two classes (1 = 'normal.', 0 = anything else),
# so the output layer has 2 units; the previous 23 outputs left 21 logits that
# could never be a target.
num_inputs, num_hiddens, num_outputs = 40, 128, 2
net = nn.Sequential(
    nn.Linear(num_inputs, num_hiddens),
    nn.ReLU(),
    nn.Linear(num_hiddens, 2 * num_hiddens),
    nn.ReLU(),
    nn.Linear(2 * num_hiddens, num_outputs)
)
net.to(device)

# Loss function: cross-entropy over the raw output logits.
loss = torch.nn.CrossEntropyLoss()

# Optimizer: Adam over all network parameters with learning rate `lr`.
optimizer = torch.optim.Adam(net.parameters(), lr=lr)


# Training
def train():
    """Run one pass over ``train_loader``, updating ``net`` after every batch.

    Returns:
        A ``(accuracy, mean_loss)`` tuple: the fraction of correctly
        classified samples and the loss averaged over the number of batches.
    """
    net.train()
    running_loss = 0.0
    hits = 0.0
    seen = 0.0

    for features, targets in train_loader:
        features = features.to(device)
        targets = targets.to(device)

        # Clear gradients left over from the previous step, then do
        # forward -> loss -> backward -> parameter update.
        net.zero_grad()
        logits = net(features)
        step_loss = loss(logits, targets)
        step_loss.backward()
        optimizer.step()

        # Statistics use the logits computed before this step's update.
        hits += (torch.argmax(logits, dim=1) == targets).sum().cpu().item()
        seen += len(targets)
        running_loss += step_loss.cpu().item()

    accuracy = hits / seen
    mean_loss = running_loss / len(train_loader)
    return accuracy, mean_loss


# Plotting
def pltfigure(x, y, title, id, data, show=None):
    """Draw one curve into a cell of a 2x2 subplot grid on the current figure.

    Args:
        x: Label for the x axis.
        y: Label for the y axis.
        title: Title of the subplot.
        id: 1-based index of the grid cell to draw into. The name shadows the
            built-in ``id`` but is kept because callers pass it by keyword.
        data: Sequence of values; plotted against ``range(len(data))``.
        show: Whether to display the figure after drawing. ``None`` (the
            default) displays it only once the last grid cell has been filled,
            so the four curves appear together in a single window instead of
            ``plt.show()`` being triggered after every subplot.
    """
    rows, cols = 2, 2
    plt.subplot(rows, cols, id)
    plt.plot(range(len(data)), data)
    plt.xlabel(x)
    plt.ylabel(y)
    plt.title(title)

    if show is None:
        show = id == rows * cols
    if show:
        # Keep titles and axis labels of neighbouring cells from overlapping.
        plt.tight_layout()
        plt.show()


# Evaluation
def test():
    """Evaluate ``net`` on ``test_loader`` without updating any parameters.

    Returns:
        A ``(accuracy, mean_loss)`` tuple: the fraction of correctly
        classified samples and the loss averaged over the number of batches.
    """
    net.eval()
    batch_loss, correct, total = 0.0, 0.0, 0.0
    # No gradients are needed for evaluation; disabling autograd avoids
    # building a computation graph for every test batch.
    with torch.no_grad():
        for data, label in test_loader:
            data, label = data.to(device), label.to(device)

            output = net(data)
            batch_loss += loss(output, label).cpu().item()
            predict_label = torch.argmax(output, dim=1)
            correct += torch.sum(predict_label == label).cpu().item()
            total += len(label)

    return correct / total, batch_loss / len(test_loader)


# Main program
def main():
    """Train and evaluate for ``epochs`` rounds, log the metrics, then plot them.

    Each round calls ``train()`` followed by ``test()``, prints accuracy and
    loss for both, and records them. Afterwards the four metric histories are
    drawn into cells 1..4 of the subplot grid via ``pltfigure``.
    """
    for caption, value in (
        ('training on: ', device),
        ('batch_size:', batch_size),
        ('epochs:', epochs),
        ('learning_rate:', lr),
    ):
        print(caption, value)
    plt.figure()

    train_acc_list = []
    train_loss_list = []
    test_acc_list = []
    test_loss_list = []
    report = 'epoch %d:  train acc: %.2f%% train loss:%.4f,  test acc: %.2f%%, test loss:%.4f'

    for epoch in range(epochs):
        train_acc, train_loss = train()
        test_acc, test_loss = test()

        print(report % (epoch, 100 * train_acc, train_loss, 100 * test_acc, test_loss))

        for history, value in (
            (train_acc_list, train_acc),
            (train_loss_list, train_loss),
            (test_acc_list, test_acc),
            (test_loss_list, test_loss),
        ):
            history.append(value)

    # Plot: one grid cell per metric, in the order train acc/loss, test acc/loss.
    panels = (
        ('acc', 'epoch-train_acc', train_acc_list),
        ('loss', 'epoch-train_loss', train_loss_list),
        ('acc', 'epoch-test_acc', test_acc_list),
        ('loss', 'epoch-test_loss', test_loss_list),
    )
    for cell, (y_label, caption, series) in enumerate(panels, start=1):
        pltfigure(x='epoch', y=y_label, title=caption, id=cell, data=series)

# Start training only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()