
Name Classifier

About the name-classification problem

  • Given a person's name as input, the model predicts which country the name most likely comes from. This matters in the business of some international companies: during user registration, the name the user types can be used to pre-select a likely country or region, show that region's flag, constrain the phone-number length, and so on.

Name-classification data

The data lives under ./data/names/ as 18 text files, one per language category (e.g. Chinese.txt, Italian.txt), each containing one name per line.

Implementation steps

  1. Import the required packages
  2. Preprocess the data files to meet the training requirements
  3. Build the RNN models (vanilla RNN, LSTM, and GRU)
  4. Build the training functions and train
  5. Build the evaluation functions and run predictions

Import the required packages

# open() from io for reading files
from io import open
# glob and os to find files in subdirectories using wildcard patterns
import glob
import os
# string and unicodedata for the standard letter set and character normalization
import string
import unicodedata
# random sampling utilities
import random
# timing and math helpers
import time
import math
# PyTorch
import torch
# torch.nn for building the models
import torch.nn as nn
# matplotlib for plotting
import matplotlib.pyplot as plt

Preprocess the data files to meet the training requirements

# All common characters: the ASCII letters plus a few punctuation marks
all_letters = string.ascii_letters + ".,;"
# Number of distinct characters
n_letters = len(all_letters)
print("n_letters:", n_letters)

n_letters: 55
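
For reference, the 55 characters are the 52 ASCII letters followed by the three punctuation marks:

print(all_letters)
# abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,;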

# Character normalization: convert Unicode to ASCII
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )
# Test
s = "Iñíguez"
a = unicodeToAscii(s)
print(a)

Iniguez
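
The function works because NFD normalization splits an accented character into a base letter plus a combining mark (Unicode category 'Mn'), and the combining mark is then filtered out. A quick check of this behavior:

# 'ñ' under NFD becomes the base letter 'n' plus a combining tilde (category 'Mn')
print([unicodedata.category(c) for c in unicodedata.normalize('NFD', 'ñ')])
# ['Ll', 'Mn']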

# Function to read names from a persisted data file
data_path = "./data/names/"

def read_lines(filename):
    lines = open(filename, encoding='utf-8').read().strip().split('\n')
    return [unicodeToAscii(line) for line in lines]

# Test
filename = data_path + "Chinese.txt"
lines = read_lines(filename)
print(lines[:10])

['Ang', 'AuYong', 'Bai', 'Ban', 'Bao', 'Bei', 'Bian', 'Bui', 'Cai', 'Cao']

# Build the list of name categories and the category-to-names dictionary
category_lines = {}
all_categories = []
for filename in glob.glob(data_path + "*.txt"):
    category = os.path.splitext(os.path.basename(filename))[0]
    all_categories.append(category)
    lines = read_lines(filename)
    category_lines[category] = lines

# Number of categories
n_categories = len(all_categories)
print("n_categories:", n_categories)
# Peek at some of the content
print(category_lines['Italian'][:5])

n_categories: 18

['Abandonato', 'Abatangelo', 'Abatantuono', 'Abate', 'Abategiovanni']

# Convert a name into a one-hot tensor
def line_to_tensor(line):
    tensor = torch.zeros(len(line), 1, n_letters)
    for li, letter in enumerate(line):
        tensor[li][0][all_letters.find(letter)] = 1
    return tensor

# Test
line = "Bai"
print("line_tensor:", line_to_tensor(line))

line_tensor: tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]], [[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]], [[0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]])
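
The resulting tensor has shape (len(line), 1, n_letters): one one-hot row per character, with a batch dimension of 1 in the middle. As a sanity check (a small sketch using only the functions defined above), the name can be recovered by taking the argmax of each row:

t = line_to_tensor("Bai")
print(t.shape)  # torch.Size([3, 1, 55])
# Map each one-hot row back to its character
print(''.join(all_letters[row.argmax().item()] for row in t.squeeze(1)))  # Bai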

Build the RNN models

Build the vanilla RNN model

# A vanilla RNN classifier built on nn.RNN

class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        """The initializer takes 4 arguments: the size of the last input dimension,
        the size of the last hidden-layer dimension, the output size, and the number of RNN layers."""
        super(RNN, self).__init__()
        # Store hidden_size and num_layers
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Instantiate the predefined nn.RNN; its three arguments are input_size, hidden_size, num_layers
        self.rnn = nn.RNN(input_size, hidden_size, num_layers)
        # Instantiate nn.Linear to map the RNN output dimension to the desired output dimension
        self.linear = nn.Linear(hidden_size, output_size)
        # Instantiate the predefined LogSoftmax layer to obtain class scores from the output layer
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, input, hidden):
        """The main vanilla-RNN logic. `input` is the input tensor of shape 1 x n_letters;
        `hidden` is the RNN hidden state of shape self.num_layers x 1 x self.hidden_size."""
        # nn.RNN requires a 3-D input, so use unsqueeze(0) to add a dimension
        input = input.unsqueeze(0)
        # Feed input and hidden into the nn.RNN instance; with num_layers=1, rr always equals hn
        rr, hn = self.rnn(input, hidden)
        # Pass the RNN result through the linear layer and softmax; also return hn for the next step
        return self.softmax(self.linear(rr)), hn

    def init_hidden(self):
        """Initialize the hidden-state tensor."""
        # A zero tensor of shape (self.num_layers, 1, self.hidden_size)
        return torch.zeros(self.num_layers, 1, self.hidden_size)

Build the LSTM model

class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.lstm = nn.LSTM(input_size, hidden_size, num_layers)
        self.linear = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, input, hidden, c):
        # Unlike nn.RNN, nn.LSTM takes and returns both a hidden state and a cell state
        input = input.unsqueeze(0)
        rr, (hn, c) = self.lstm(input, (hidden, c))
        return self.softmax(self.linear(rr)), hn, c

    def init_hidden_and_c(self):
        # Initialize both the hidden state and the cell state to zeros
        c = hidden = torch.zeros(self.num_layers, 1, self.hidden_size)
        return hidden, c

Build the GRU model

class GRU(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super(GRU, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.gru = nn.GRU(input_size, hidden_size, num_layers)
        self.linear = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, input, hidden):
        input = input.unsqueeze(0)
        rr, hn = self.gru(input, hidden)
        return self.softmax(self.linear(rr)), hn

    def init_hidden(self):
        return torch.zeros(self.num_layers, 1, self.hidden_size)
# Instantiate the three models and run a quick smoke test
input_size = n_letters
n_hidden = 128  # size of the last hidden-layer dimension
output_size = n_categories
input = line_to_tensor('B').squeeze(0)
hidden = c = torch.zeros(1, 1, n_hidden)
rnn = RNN(input_size, n_hidden, output_size)
lstm = LSTM(input_size, n_hidden, output_size)
gru = GRU(input_size, n_hidden, output_size)

rnn_output, next_hidden = rnn(input, hidden)
print("rnn:", rnn_output)
print(rnn_output.shape)
lstm_output, next_hidden, c = lstm(input, hidden, c)
print("lstm:", lstm_output)
print(lstm_output.shape)
gru_output, next_hidden = gru(input, hidden)
print("gru:", gru_output)
print(gru_output.shape)

rnn: tensor([[[-2.7586, -2.9267, -2.9233, -2.9483, -2.9034, -2.8620, -3.0104, -2.9470, -2.7736, -2.8324, -2.8893, -2.9148, -2.8490, -2.9128, -3.0092, -2.8207, -2.8846, -2.9011]]], grad_fn=<LogSoftmaxBackward>)

torch.Size([1, 1, 18])

lstm: tensor([[[-2.8870, -2.8785, -2.8590, -2.9680, -2.8337, -2.8333, -2.8774, -2.9016, -2.8173, -2.9711, -2.9113, -2.8614, -2.8533, -2.8560, -2.8948, -2.9332, -2.9586, -2.9509]]], grad_fn=<LogSoftmaxBackward>)

torch.Size([1, 1, 18])

gru: tensor([[[-2.9899, -2.8605, -2.8580, -2.8376, -2.9263, -2.9115, -2.8091, -2.9688, -2.9039, -2.9790, -2.8622, -2.9508, -2.8404, -2.9036, -2.8456, -2.8766, -2.8663, -2.8602]]], grad_fn=<LogSoftmaxBackward>)

torch.Size([1, 1, 18])

Build the training functions and train

# Get the most likely category from the model output
def category_from_output(output):
    top_n, top_i = output.topk(1)
    category_i = top_i[0].item()
    return all_categories[category_i], category_i

# A quick look at how torch.topk works
x = torch.arange(1., 6.)
print(x)
print(torch.topk(x, 3))

tensor([1., 2., 3., 4., 5.])

torch.return_types.topk( values=tensor([5., 4., 3.]), indices=tensor([4, 3, 2]))

output = gru_output
category, category_i = category_from_output(output)
print(category, category_i)

Portuguese 15

# Generate a random training example
def random_train_example():
    category = random.choice(all_categories)
    line = random.choice(category_lines[category])
    category_tensor = torch.tensor([all_categories.index(category)], dtype=torch.long)
    line_tensor = line_to_tensor(line)
    return category, line, category_tensor, line_tensor

# Test
for i in range(10):
    category, line, category_tensor, line_tensor = random_train_example()
    print("category=", category, "/ line=", line, "/category_tensor=", category_tensor)

category= Czech / line= Tykal /category_tensor= tensor([5])

category= Japanese / line= Tsukatani /category_tensor= tensor([9])

category= Chinese / line= Xin /category_tensor= tensor([0])

category= Italian / line= Albanesi /category_tensor= tensor([12])

category= Japanese / line= Fujimaki /category_tensor= tensor([9])

category= Scottish / line= Black /category_tensor= tensor([14])

category= French / line= Bernard /category_tensor= tensor([7])

category= Greek / line= Tselios /category_tensor= tensor([8])

category= Chinese / line= Chen /category_tensor= tensor([0])

category= German / line= Hauer /category_tensor= tensor([16])

Build the vanilla RNN training function

# NLLLoss pairs with the LogSoftmax output of the models
criterion = nn.NLLLoss()
learning_rate = 0.005

def train_RNN(category_tensor, line_tensor):
    hidden = rnn.init_hidden()
    rnn.zero_grad()
    # Feed the name through the RNN one character at a time
    for i in range(line_tensor.size()[0]):
        output, hidden = rnn(line_tensor[i], hidden)

    loss = criterion(output.squeeze(0), category_tensor)
    loss.backward()
    # Manual SGD step: move each parameter against its gradient
    for p in rnn.parameters():
        p.data.add_(p.grad.data, alpha=-learning_rate)

    return output, loss.item()
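
The manual update loop above is stochastic gradient descent written by hand. A minimal sketch of the same step using torch.optim.SGD (assuming the rnn instance and criterion defined above; the function name is illustrative):

optimizer = torch.optim.SGD(rnn.parameters(), lr=learning_rate)

def train_RNN_with_optimizer(category_tensor, line_tensor):
    hidden = rnn.init_hidden()
    optimizer.zero_grad()
    for i in range(line_tensor.size()[0]):
        output, hidden = rnn(line_tensor[i], hidden)
    loss = criterion(output.squeeze(0), category_tensor)
    loss.backward()
    # optimizer.step() applies the same p -= learning_rate * p.grad update
    optimizer.step()
    return output, loss.item()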

Build the LSTM training function

def train_LSTM(category_tensor, line_tensor):
    hidden, c = lstm.init_hidden_and_c()
    lstm.zero_grad()
    for i in range(line_tensor.size()[0]):
        output, hidden, c = lstm(line_tensor[i], hidden, c)
    loss = criterion(output.squeeze(0), category_tensor)
    loss.backward()
    for p in lstm.parameters():
        p.data.add_(p.grad.data, alpha=-learning_rate)
    return output, loss.item()

Build the GRU training function

def train_GRU(category_tensor, line_tensor):
    hidden = gru.init_hidden()
    gru.zero_grad()
    for i in range(line_tensor.size()[0]):
        output, hidden = gru(line_tensor[i], hidden)
    loss = criterion(output.squeeze(0), category_tensor)
    loss.backward()
    for p in gru.parameters():
        p.data.add_(p.grad.data, alpha=-learning_rate)
    return output, loss.item()

Build the elapsed-time helper function

def time_since(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return "%dm %ds" % (m, s)
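
A quick check, with a start value contrived to simulate 2 minutes and 5 seconds of elapsed time:

start = time.time() - 125
print(time_since(start))  # 2m 5s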

Build the training loop with progress logging

n_iters = 1000
print_every = 50
plot_every = 10

def train(train_type_fn):
    all_losses = []
    start = time.time()
    current_loss = 0
    for iter in range(1, n_iters + 1):
        category, line, category_tensor, line_tensor = random_train_example()
        output, loss = train_type_fn(category_tensor, line_tensor)
        current_loss += loss
        # Log progress every print_every iterations
        if iter % print_every == 0:
            guess, guess_i = category_from_output(output)
            correct = '✓' if guess == category else '✗ (%s)' % category
            print('%d %d%% (%s) %.4f %s / %s %s' % (iter, iter / n_iters * 100, time_since(start), loss, line, guess, correct))
        # Record the average loss every plot_every iterations
        if iter % plot_every == 0:
            all_losses.append(current_loss / plot_every)
            current_loss = 0
    return all_losses, int(time.time() - start)

Train

# Call train() for the RNN, LSTM, and GRU models,
# returning each model's losses and training time for plotting
all_losses1, period1 = train(train_RNN)
all_losses2, period2 = train(train_LSTM)
all_losses3, period3 = train(train_GRU)

# Plot the loss-comparison curves and the training-time bar chart
# Figure 0
plt.figure(0)
# Loss-comparison curves
plt.plot(all_losses1, label="RNN")
plt.plot(all_losses2, color="red", label="LSTM")
plt.plot(all_losses3, color="orange", label="GRU")
plt.legend(loc='upper left')

# Figure 1
plt.figure(1)
x_data = ["RNN", "LSTM", "GRU"]
y_data = [period1, period2, period3]
# Training-time bar chart
plt.bar(range(len(x_data)), y_data, tick_label=x_data)

...

750 75% (0m 2s) 2.8973 Reijnders / Korean ✗ (Dutch)

800 80% (0m 3s) 2.9324 Huynh / Korean ✗ (Vietnamese)

850 85% (0m 3s) 2.7586 Suk / Korean ✓

900 90% (0m 3s) 2.9910 Stanek / Korean ✗ (Polish)

950 95% (0m 3s) 2.9644 Klimek / Greek ✗ (Polish)

1000 100% (0m 3s) 2.9666 Le / Korean ✗ (Vietnamese)

<BarContainer object of 3 artists>

Conclusions

Analysis of the loss-comparison curves:

  • How fast the training loss drops indicates how well a model converges. The plot shows the vanilla RNN converging best, followed by GRU, then LSTM. The reason: the texts here are names, which are short, and there is essentially no meaningful relationship between distant letters, so the long-range-dependency advantages of the improved LSTM and GRU models cannot come into play. When choosing a model for later work, pick the one best suited to the task based on analysis of the task and experimental comparison.

Analysis of the training-time comparison chart:

  • Training time reflects a model's computational complexity. As the chart shows, and as the earlier theoretical analysis predicts, the vanilla RNN has the lowest complexity and trains fastest, followed by GRU, then LSTM.

Conclusion: model selection is generally done through experimental comparison. A more complex or more advanced model is not necessarily better; the best answer comes from analyzing the data and the experimental results in the context of your specific task.

Build the evaluation functions and run predictions

Build the vanilla RNN evaluation function

def evaluate_RNN(line_tensor):
    hidden = rnn.init_hidden()
    for i in range(line_tensor.size()[0]):
        output, hidden = rnn(line_tensor[i], hidden)
    return output.squeeze(0)

Build the LSTM evaluation function

def evaluate_LSTM(line_tensor):
    hidden, c = lstm.init_hidden_and_c()
    for i in range(line_tensor.size()[0]):
        output, hidden, c = lstm(line_tensor[i], hidden, c)
    return output.squeeze(0)

Build the GRU evaluation function

def evaluate_GRU(line_tensor):
    hidden = gru.init_hidden()
    for i in range(line_tensor.size()[0]):
        output, hidden = gru(line_tensor[i], hidden)
    return output.squeeze(0)
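
The three evaluators differ only in how the recurrent state is initialized and threaded through the loop. A deduplicated sketch (assuming the RNN, LSTM, and GRU classes defined above; the function name is illustrative):

def evaluate_model(model, line_tensor):
    if isinstance(model, LSTM):
        # The LSTM carries both a hidden state and a cell state
        hidden, c = model.init_hidden_and_c()
        for i in range(line_tensor.size()[0]):
            output, hidden, c = model(line_tensor[i], hidden, c)
    else:
        # RNN and GRU carry a single hidden state
        hidden = model.init_hidden()
        for i in range(line_tensor.size()[0]):
            output, hidden = model(line_tensor[i], hidden)
    return output.squeeze(0)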

Test

line = "Bai"
line_tensor = line_to_tensor(line)
rnn_output = evaluate_RNN(line_tensor)
lstm_output = evaluate_LSTM(line_tensor)
gru_output = evaluate_GRU(line_tensor)
print("rnn_output:", rnn_output)
print("lstm_output:", lstm_output)
print("gru_output:", gru_output)

rnn_output: tensor([[-5965.0801, 0.0000, -5944.7900, -5938.4595, -5962.0166, -5947.8330, -5934.5571, -5944.6787, -5954.0552, -5939.5146, -5950.5410, -5955.7983, -5929.8423, -5952.1704, -5936.9551, -5948.0830, -5970.2207, -5947.5967]], grad_fn=<SqueezeBackward1>)

lstm_output: tensor([[-2.8458, -2.9780, -2.8754, -2.9936, -2.8406, -2.9247, -3.0767, -2.8284, -2.9451, -2.8750, -2.8369, -2.7955, -2.8501, -2.8687, -2.9414, -2.8326, -2.8425, -2.9191]], grad_fn=<SqueezeBackward1>)

gru_output: tensor([[-2.9983, -2.6126, -2.9359, -2.9278, -2.8107, -3.0219, -3.0625, -2.9407, -3.0557, -2.8054, -2.6451, -2.9002, -2.7039, -2.7317, -3.1449, -2.9193, -2.9134, -3.1068]], grad_fn=<SqueezeBackward1>)

Build the prediction function

def predict(input_line, evaluate, n_predictions=3):
    print("\n> %s" % input_line)
    # No gradients are needed during prediction
    with torch.no_grad():
        output = evaluate(line_to_tensor(input_line))
        # Take the n_predictions largest values and their indices along dimension 1
        topv, topi = output.topk(n_predictions, 1, True)
        predictions = []
        for i in range(n_predictions):
            value = topv[0][i].item()
            category_index = topi[0][i].item()
            print("(%.2f) %s" % (value, all_categories[category_index]))
            predictions.append([value, all_categories[category_index]])
        return predictions
# Run predictions with each evaluator
for evaluate_fn in [evaluate_RNN, evaluate_LSTM, evaluate_GRU]:
    print("-" * 18)
    predict("Dovesky", evaluate_fn)
    predict("Jackson", evaluate_fn)
    predict("Satoshi", evaluate_fn)

> Dovesky

(0.00) Italian

(-5929.84) Czech

(-5934.56) Irish

> Jackson

(0.00) Italian

(-5929.84) Czech

(-5934.56) Irish

> Satoshi

(0.00) Italian

(-5929.84) Czech

(-5934.56) Irish


> Dovesky

(-2.80) English

(-2.81) Portuguese

(-2.82) Arabic

> Jackson

(-2.80) Portuguese

(-2.82) English

(-2.82) Arabic

> Satoshi

(-2.80) Arabic

(-2.82) English

(-2.82) Portuguese


> Dovesky

(-2.62) Czech

(-2.67) Polish

(-2.69) Greek

> Jackson

(-2.74) Scottish

(-2.75) English

(-2.77) Russian

> Satoshi

(-2.61) Greek

(-2.64) Italian

(-2.70) Czech