
News Topic Classification

About the news topic classification task

  • The input is the text of a news report, and a model is used to predict which type of news it most likely belongs to. This is a typical text classification problem. Here we assume the types are mutually exclusive, i.e. each text description belongs to exactly one type.

News topic classification data

Getting the data via torchtext

# Import the required torch packages
import torch
import torchtext
import os

# Define the download path for the data
load_data_path = "./data"
# Create the directory if it does not exist
if not os.path.isdir(load_data_path):
    os.mkdir(load_data_path)

# Select the "AG_NEWS" text classification dataset from torchtext
# and load the training and test data into memory.
# (Depending on the torchtext version, the entry point may instead be torchtext.datasets.AG_NEWS.)
train_dataset, test_dataset = torchtext.datasets.DATASETS["AG_NEWS"](root=load_data_path)

Dataset files:

  • train.csv contains the training data, 120,000 samples in total; test.csv contains the validation data, 7,600 samples in total. A quick way to peek at a few samples is shown below.
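To get a feel for the data, you can iterate over the first few samples before any further processing. A minimal sketch, assuming each sample is a (label, text) pair with integer labels 1-4 (the four AG_NEWS categories):

# Print the label and the first 80 characters of the first three news texts
for i, (label, text) in enumerate(train_dataset):
    print(label, text[:80])
    if i >= 2:
        break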

Processing the data

from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence

def process_datasets_by_Tokenizer(train_datasets, test_datasets, cutlen=256):
    """Numericalize the (label, text) pairs with a keras Tokenizer and pad/truncate to cutlen."""
    tokenizer = Tokenizer()

    train_datasets_texts = []
    train_datasets_labels = []
    test_datasets_texts = []
    test_datasets_labels = []
    for item in train_datasets:
        train_datasets_labels.append(item[0] - 1)  # map the labels from [1, 4] to [0, 3]
        train_datasets_texts.append(item[1])
    for item in test_datasets:
        test_datasets_labels.append(item[0] - 1)
        test_datasets_texts.append(item[1])

    all_datasets_texts = train_datasets_texts + test_datasets_texts
    all_datasets_labels = train_datasets_labels + test_datasets_labels

    # Build the vocabulary on all texts, then map each text to a sequence of word indices
    tokenizer.fit_on_texts(all_datasets_texts)
    train_datasets_seqs = tokenizer.texts_to_sequences(train_datasets_texts)
    test_datasets_seqs = tokenizer.texts_to_sequences(test_datasets_texts)

    # Truncate or pad the index sequences so they all have the same length
    train_datasets_seqs = sequence.pad_sequences(train_datasets_seqs, cutlen)
    test_datasets_seqs = sequence.pad_sequences(test_datasets_seqs, cutlen)

    # Zip the padded sequences back together with their labels
    train_datasets = list(zip(train_datasets_seqs, train_datasets_labels))
    test_datasets = list(zip(test_datasets_seqs, test_datasets_labels))

    # Vocabulary size: keras assigns word indices starting at 1 and padding uses 0,
    # so add 1 to make every index a valid row of the embedding table
    vocab_size = len(tokenizer.index_word) + 1
    num_class = len(set(all_datasets_labels))

    return train_datasets, test_datasets, vocab_size, num_class

train_datasets, test_datasets, vocab_size, num_class = process_datasets_by_Tokenizer(train_dataset, test_dataset)
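Before moving on, it helps to confirm what the processing step produced. A small check; the exact vocabulary size depends on the tokenizer:

# Each sample is now a (padded index sequence, label) pair
sample_seq, sample_label = train_datasets[0]
print(len(train_datasets), len(test_datasets))  # 120000 7600
print(sample_seq.shape, sample_label)           # (256,) and a label in 0..3
print(vocab_size, num_class)                    # vocabulary size and 4 classes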

Implementation steps for news topic classification

  1. Build a text classification model with an Embedding layer
  2. Batch the data
  3. Build the training and validation functions
  4. Train and validate the model
  5. Inspect the word vectors learned by the embedding layer

Building a text classification model with an Embedding layer

# Import the torch modules needed to build the model
import torch.nn as nn
import torch.nn.functional as F

# Specify the BATCH_SIZE
BATCH_SIZE = 16

# Detect the available device; a GPU will be used if one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class TextSentiment(nn.Module):
    """Text classification model."""
    def __init__(self, vocab_size, embed_dim, num_class):
        """
        :param vocab_size: total number of distinct words in the corpus
        :param embed_dim: dimensionality of the word embeddings
        :param num_class: number of text categories
        """
        super().__init__()
        # Instantiate the embedding layer; sparse=True means that when gradients are
        # computed for this layer, only the rows used in the batch are updated.
        self.embedding = nn.Embedding(vocab_size, embed_dim, sparse=True)
        # Instantiate the linear layer with input size embed_dim and output size num_class
        self.fc = nn.Linear(embed_dim, num_class)
        # Initialize the weights of each layer
        self.init_weights()

    def init_weights(self):
        """Weight initialization."""
        # Range of the initial weights
        initrange = 0.5
        # Initialize the embedding weights from a uniform distribution
        self.embedding.weight.data.uniform_(-initrange, initrange)
        # self.fc.weight.data.uniform_(-initrange, initrange)
        # Initialize the bias to zero
        self.fc.bias.data.zero_()

    def forward(self, text):
        """
        :param text: the numericalized text (a 1-D tensor of word indices)
        :return: a tensor with one score per class, used to decide the text category
        """
        # Look up the embeddings
        # >>> embedded.shape
        # (m, 32), where m is the total number of word indices in a BATCH_SIZE-sized batch
        embedded = self.embedding(text)
        # Next we need to turn (m, 32) into (BATCH_SIZE, 32)
        # so that the fc layer output can be used to compute the loss.
        # Since m is much larger than BATCH_SIZE = 16,
        # integer-divide m by BATCH_SIZE to get the number c of complete chunks.
        c = embedded.size(0) // BATCH_SIZE
        # Keep only the first c * BATCH_SIZE vectors so the length divides evenly
        embedded = embedded[:BATCH_SIZE * c]
        # We want to average every c rows with average pooling,
        # but avg_pool1d pools along the last dimension and expects a 3-D input,
        # so transpose the tensor and add a leading dimension.
        embedded = embedded.transpose(1, 0).unsqueeze(0)
        # Apply average pooling with kernel size c,
        # i.e. every c elements are averaged into a single value.
        embedded = F.avg_pool1d(embedded, kernel_size=c)
        # Finally, drop the added dimension, transpose back, and feed the result to the fc layer.
        # (Note: nn.CrossEntropyLoss applies log-softmax internally, so returning
        #  the raw logits, as in the commented-out line, would also work.)
        # return self.fc(embedded[0].transpose(1, 0))
        return F.softmax(self.fc(embedded[0].transpose(1, 0)), dim=1)

# Total number of distinct words in the corpus (including the padding index)
VOCAB_SIZE = vocab_size
# Word embedding dimensionality
EMBED_DIM = 32
# Total number of text categories
NUM_CLASS = num_class
# Instantiate the model
model = TextSentiment(VOCAB_SIZE, EMBED_DIM, NUM_CLASS).to(device)
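As a quick sanity check, you can run a dummy batch of word indices through the freshly initialized model and confirm the output shape. The numbers below are made up for illustration:

# Hypothetical smoke test: 16 samples of 4 word indices each, flattened into one 1-D tensor
dummy_text = torch.randint(1, VOCAB_SIZE, (BATCH_SIZE * 4,)).to(device)
with torch.no_grad():
    out = model(dummy_text)
print(out.shape)  # expected: torch.Size([16, 4]), i.e. (BATCH_SIZE, NUM_CLASS)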

Batching the data

def generate_batch(batch):
    """Flatten a batch of (sequence, label) pairs into one long index tensor and one label tensor."""
    text = []
    label = []
    for item in batch:
        text.extend(item[0])
        label.append(item[1])
    return torch.tensor(text), torch.tensor(label)

# Quick test
batch = [(torch.tensor([3, 23, 2, 8]), 1), (torch.tensor([3, 45, 21, 6]), 0)]
print(generate_batch(batch))

(tensor([ 3, 23, 2, 8, 3, 45, 21, 6]), tensor([1, 0]))

Building the training and validation functions

from torch.utils.data import DataLoader

def train(train_data):
    """Model training function."""
    # Initialize the total training loss and accuracy to 0
    train_loss = 0
    train_acc = 0
    # Use a DataLoader to generate batches of size BATCH_SIZE for training;
    # each batch is assembled by the generate_batch collate function
    data = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=generate_batch)
    for i, (text, cls) in enumerate(data):
        # Reset the optimizer gradients to zero
        optimizer.zero_grad()
        # Feed one batch through the model to get the output
        output = model(text.to(device))
        # Compute the loss from the model output and the true labels
        loss = criterion(output, cls.to(device))
        # Add this batch's loss to the total loss
        train_loss += loss.item()
        # Backpropagate the error
        loss.backward()
        # Update the parameters
        optimizer.step()
        # Update the running count of correct predictions
        train_acc += (output.argmax(1) == cls.to(device)).sum().item()

    # Adjust the optimizer's learning rate
    scheduler.step()

    # Return the average loss and average accuracy for this epoch
    return train_loss / len(train_data), train_acc / len(train_data)

def valid(valid_data):
    """Model validation function."""
    # Initialize the total validation loss and accuracy to 0
    # (the accumulator is named valid_loss so it is not overwritten by the per-batch loss)
    valid_loss = 0
    acc = 0
    # As in training, use a DataLoader to generate batches of validation data
    data = DataLoader(valid_data, batch_size=BATCH_SIZE, collate_fn=generate_batch)
    for text, cls in data:
        # No gradients are needed during validation
        with torch.no_grad():
            # Get the model output
            output = model(text.to(device))
            # Compute the loss
            loss = criterion(output, cls.to(device))
            # Add this batch's loss and correct-prediction count to the totals
            valid_loss += loss.item()
            acc += (output.argmax(1) == cls.to(device)).sum().item()
    return valid_loss / len(valid_data), acc / len(valid_data)
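A small refinement not present in the functions above: standard PyTorch practice is to switch the model between training and evaluation mode around these two phases. This particular model has no dropout or batch-norm layers, so it makes no difference here, but the sketch below shows where the calls would go:

# Hypothetical wrapper for one epoch, showing where the mode switches belong
def run_one_epoch(sub_train, sub_valid):
    model.train()   # enable training-time behaviour
    train_loss, train_acc = train(sub_train)
    model.eval()    # enable evaluation-time behaviour
    valid_loss, valid_acc = valid(sub_valid)
    return train_loss, train_acc, valid_loss, valid_acc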

Training and validating the model

import time
from torch.utils.data.dataset import random_split

# Number of training epochs
N_EPOCHS = 10
# Initial value of the best (minimum) validation loss
min_valid_loss = float("inf")
# Loss function: the predefined cross-entropy loss
criterion = torch.nn.CrossEntropyLoss().to(device)
# Optimizer: stochastic gradient descent
optimizer = torch.optim.SGD(model.parameters(), lr=4.0)
# Learning-rate scheduler: StepLR, which decays the learning rate every epoch
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)
# Use 95% of train_datasets as the training split and the rest for validation
train_len = int(len(train_datasets) * 0.95)
sub_train, sub_valid = random_split(train_datasets, [train_len, len(train_datasets) - train_len])

for epoch in range(N_EPOCHS):
    start_time = time.time()
    # Call train and valid to get the average loss and accuracy for this epoch
    train_loss, train_acc = train(sub_train)
    valid_loss, valid_acc = valid(sub_valid)

    secs = int(time.time() - start_time)
    mins = secs // 60
    secs = secs % 60

    print("Epoch: %d" % (epoch + 1), "| time in %d minutes, %d seconds" % (mins, secs))
    print(f"\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc*100:.1f}%(train)")
    print(f"\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc*100:.1f}%(valid)")

Epoch: 1 | time in 0 minutes, 25 seconds | Loss: 0.0873(train) | Acc: 32.1%(train) | Loss: 0.0004(valid) | Acc: 35.9%(valid)
Epoch: 2 | time in 0 minutes, 25 seconds | Loss: 0.0826(train) | Acc: 38.8%(train) | Loss: 0.0005(valid) | Acc: 39.4%(valid)
Epoch: 3 | time in 0 minutes, 25 seconds | Loss: 0.0821(train) | Acc: 40.2%(train) | Loss: 0.0004(valid) | Acc: 34.9%(valid)
Epoch: 4 | time in 0 minutes, 25 seconds | Loss: 0.0804(train) | Acc: 43.0%(train) | Loss: 0.0005(valid) | Acc: 39.1%(valid)
Epoch: 5 | time in 0 minutes, 25 seconds | Loss: 0.0793(train) | Acc: 45.0%(train) | Loss: 0.0005(valid) | Acc: 31.5%(valid)
Epoch: 6 | time in 0 minutes, 25 seconds | Loss: 0.0778(train) | Acc: 47.7%(train) | Loss: 0.0004(valid) | Acc: 56.5%(valid)
Epoch: 7 | time in 0 minutes, 25 seconds | Loss: 0.0768(train) | Acc: 49.5%(train) | Loss: 0.0004(valid) | Acc: 37.1%(valid)
Epoch: 8 | time in 0 minutes, 25 seconds | Loss: 0.0754(train) | Acc: 52.0%(train) | Loss: 0.0004(valid) | Acc: 58.4%(valid)
Epoch: 9 | time in 0 minutes, 25 seconds | Loss: 0.0741(train) | Acc: 54.2%(train) | Loss: 0.0004(valid) | Acc: 48.4%(valid)
Epoch: 10 | time in 0 minutes, 25 seconds | Loss: 0.0731(train) | Acc: 56.0%(train) | Loss: 0.0005(valid) | Acc: 34.3%(valid)
Epoch: 11 | time in 0 minutes, 25 seconds | Loss: 0.0716(train) | Acc: 58.4%(train) | Loss: 0.0003(valid) | Acc: 68.2%(valid)
Epoch: 12 | time in 0 minutes, 25 seconds | Loss: 0.0706(train) | Acc: 60.1%(train) | Loss: 0.0003(valid) | Acc: 59.5%(valid)
Epoch: 13 | time in 0 minutes, 26 seconds | Loss: 0.0694(train) | Acc: 62.2%(train) | Loss: 0.0003(valid) | Acc: 69.2%(valid)
Epoch: 14 | time in 0 minutes, 25 seconds | Loss: 0.0684(train) | Acc: 63.9%(train) | Loss: 0.0003(valid) | Acc: 64.7%(valid)
Epoch: 15 | time in 0 minutes, 25 seconds | Loss: 0.0675(train) | Acc: 65.4%(train) | Loss: 0.0004(valid) | Acc: 65.5%(valid)
Epoch: 16 | time in 0 minutes, 25 seconds | Loss: 0.0664(train) | Acc: 67.2%(train) | Loss: 0.0003(valid) | Acc: 64.6%(valid)
Epoch: 17 | time in 0 minutes, 25 seconds | Loss: 0.0657(train) | Acc: 68.5%(train) | Loss: 0.0003(valid) | Acc: 70.4%(valid)
Epoch: 18 | time in 0 minutes, 25 seconds | Loss: 0.0650(train) | Acc: 69.5%(train) | Loss: 0.0004(valid) | Acc: 69.0%(valid)
Epoch: 19 | time in 0 minutes, 25 seconds | Loss: 0.0643(train) | Acc: 70.8%(train) | Loss: 0.0003(valid) | Acc: 76.3%(valid)
Epoch: 20 | time in 0 minutes, 25 seconds | Loss: 0.0636(train) | Acc: 72.0%(train) | Loss: 0.0003(valid) | Acc: 70.5%(valid)
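The loop above defines min_valid_loss but never uses it; a natural extension is to save a checkpoint whenever the validation loss improves. A minimal sketch, placed at the end of each epoch, with a hypothetical file name:

# Hypothetical checkpointing step (best_model.pth is an assumed path)
if valid_loss < min_valid_loss:
    min_valid_loss = valid_loss
    torch.save(model.state_dict(), "./best_model.pth")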

Inspecting the word vectors in the embedding layer

print(model.state_dict()['embedding.weight'])

tensor([[-3.9321e-02,  1.2770e-02, -1.2725e-02,  ..., -3.7640e-02,  5.0681e-02,  3.4286e-03],
        [-1.6661e+00, -5.6520e+00, -6.9105e-03,  ..., -7.4342e-01,  1.5925e+00, -3.9538e-01],
        [ 1.2449e+00,  1.8321e+00,  8.1467e-01,  ...,  4.5453e-01, -1.1000e+00,  8.3954e-01],
        ...,
        [-2.6404e-01, -4.9704e-01,  8.3933e-02,  ..., -4.8199e-01,  3.0737e-01,  4.4653e-01],
        [-1.4254e-01,  2.1912e-01, -3.5175e-01,  ...,  1.7252e-01, -4.0052e-01, -1.5885e-02],
        [-1.5442e-02, -2.2085e-01, -3.8362e-01,  ..., -3.5968e-01,  3.6406e-01,  3.7704e-01]],
       device='cuda:0')
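Each row of this matrix is the learned 32-dimensional vector for one word index. To look up the vector of a specific word you also need the tokenizer's word-to-index mapping, which process_datasets_by_Tokenizer does not currently return; the sketch below assumes the fitted tokenizer has been made accessible (a hypothetical change), and the example word is arbitrary:

# Hypothetical lookup, assuming the fitted keras `tokenizer` is in scope
embedding_matrix = model.state_dict()['embedding.weight']
word = "stocks"                        # an example word assumed to be in the vocabulary
idx = tokenizer.word_index.get(word)   # keras Tokenizer indices start at 1
if idx is not None:
    print(embedding_matrix[idx])       # the learned vector for this word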