
Name Classifier

About the name-classification problem

  • Given a person's name as input, the model predicts which country the name most likely comes from. This matters in the business of some international companies: during user registration, the name the user types can be used to pre-select a likely country or region, show that region's flag, constrain the phone-number length, and so on.

Name-classification data

The data lives under ./data/names/ as 18 text files, one per language category (e.g. Chinese.txt, Italian.txt), each containing one name per line.

Implementation steps

  1. Import the required packages
  2. Preprocess the data files to meet the training requirements
  3. Build the RNN models (vanilla RNN, LSTM, and GRU)
  4. Build the training functions and train
  5. Build the evaluation functions and run predictions

Import the required packages

# open() from io for reading files
from io import open
# glob and os to find files in subdirectories using wildcard patterns
import glob
import os
# string and unicodedata for the standard letter set and character normalization
import string
import unicodedata
# random sampling utilities
import random
# timing and math helpers
import time
import math
# PyTorch
import torch
# torch.nn for building the models
import torch.nn as nn
# matplotlib for plotting
import matplotlib.pyplot as plt

Preprocess the data files to meet the training requirements

# All common characters: the ASCII letters plus a few punctuation marks
all_letters = string.ascii_letters + ".,;"
# Number of distinct characters
n_letters = len(all_letters)
print("n_letters:", n_letters)

n_letters: 55
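
For reference, the 55 characters are the 52 ASCII letters followed by the three punctuation marks:

print(all_letters)
# abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,;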

# Character normalization: convert Unicode to ASCII
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )
# Test
s = "Iñíguez"
a = unicodeToAscii(s)
print(a)

Iniguez
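
The function works because NFD normalization splits an accented character into a base letter plus a combining mark (Unicode category 'Mn'), and the combining mark is then filtered out. A quick check of this behavior:

# 'ñ' under NFD becomes the base letter 'n' plus a combining tilde (category 'Mn')
print([unicodedata.category(c) for c in unicodedata.normalize('NFD', 'ñ')])
# ['Ll', 'Mn']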

# Function to read names from a persisted data file
data_path = "./data/names/"

def read_lines(filename):
    lines = open(filename, encoding='utf-8').read().strip().split('\n')
    return [unicodeToAscii(line) for line in lines]

# Test
filename = data_path + "Chinese.txt"
lines = read_lines(filename)
print(lines[:10])

['Ang', 'AuYong', 'Bai', 'Ban', 'Bao', 'Bei', 'Bian', 'Bui', 'Cai', 'Cao']

# Build the list of name categories and the category-to-names dictionary
category_lines = {}
all_categories = []
for filename in glob.glob(data_path + "*.txt"):
    category = os.path.splitext(os.path.basename(filename))[0]
    all_categories.append(category)
    lines = read_lines(filename)
    category_lines[category] = lines

# Number of categories
n_categories = len(all_categories)
print("n_categories:", n_categories)
# Peek at some of the content
print(category_lines['Italian'][:5])

n_categories: 18

['Abandonato', 'Abatangelo', 'Abatantuono', 'Abate', 'Abategiovanni']

# Convert a name into a one-hot tensor
def line_to_tensor(line):
    tensor = torch.zeros(len(line), 1, n_letters)
    for li, letter in enumerate(line):
        tensor[li][0][all_letters.find(letter)] = 1
    return tensor

# Test
line = "Bai"
print("line_tensor:", line_to_tensor(line))

line_tensor: tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]], [[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]], [[0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]])
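
The resulting tensor has shape (len(line), 1, n_letters): one one-hot row per character, with a batch dimension of 1 in the middle. As a sanity check (a small sketch using only the functions defined above), the name can be recovered by taking the argmax of each row:

t = line_to_tensor("Bai")
print(t.shape)  # torch.Size([3, 1, 55])
# Map each one-hot row back to its character
print(''.join(all_letters[row.argmax().item()] for row in t.squeeze(1)))  # Bai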

Build the RNN models

Build the vanilla RNN model

# A vanilla RNN classifier built on nn.RNN

class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        """The initializer takes 4 arguments: the size of the last input dimension,
        the size of the last hidden-layer dimension, the output size, and the number of RNN layers."""
        super(RNN, self).__init__()
        # Store hidden_size and num_layers
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Instantiate the predefined nn.RNN; its three arguments are input_size, hidden_size, num_layers
        self.rnn = nn.RNN(input_size, hidden_size, num_layers)
        # Instantiate nn.Linear to map the RNN output dimension to the desired output dimension
        self.linear = nn.Linear(hidden_size, output_size)
        # Instantiate the predefined LogSoftmax layer to obtain class scores from the output layer
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, input, hidden):
        """The main vanilla-RNN logic. `input` is the input tensor of shape 1 x n_letters;
        `hidden` is the RNN hidden state of shape self.num_layers x 1 x self.hidden_size."""
        # nn.RNN requires a 3-D input, so use unsqueeze(0) to add a dimension
        input = input.unsqueeze(0)
        # Feed input and hidden into the nn.RNN instance; with num_layers=1, rr always equals hn
        rr, hn = self.rnn(input, hidden)
        # Pass the RNN result through the linear layer and softmax; also return hn for the next step
        return self.softmax(self.linear(rr)), hn

    def init_hidden(self):
        """Initialize the hidden-state tensor."""
        # A zero tensor of shape (self.num_layers, 1, self.hidden_size)
        return torch.zeros(self.num_layers, 1, self.hidden_size)

Build the LSTM model

class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.lstm = nn.LSTM(input_size, hidden_size, num_layers)
        self.linear = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, input, hidden, c):
        # Unlike nn.RNN, nn.LSTM takes and returns both a hidden state and a cell state
        input = input.unsqueeze(0)
        rr, (hn, c) = self.lstm(input, (hidden, c))
        return self.softmax(self.linear(rr)), hn, c

    def init_hidden_and_c(self):
        # Initialize both the hidden state and the cell state to zeros
        c = hidden = torch.zeros(self.num_layers, 1, self.hidden_size)
        return hidden, c

Build the GRU model

class GRU(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super(GRU, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.gru = nn.GRU(input_size, hidden_size, num_layers)
        self.linear = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, input, hidden):
        input = input.unsqueeze(0)
        rr, hn = self.gru(input, hidden)
        return self.softmax(self.linear(rr)), hn

    def init_hidden(self):
        return torch.zeros(self.num_layers, 1, self.hidden_size)
# Instantiate the three models and run a quick smoke test
input_size = n_letters
n_hidden = 128  # size of the last hidden-layer dimension
output_size = n_categories
input = line_to_tensor('B').squeeze(0)
hidden = c = torch.zeros(1, 1, n_hidden)
rnn = RNN(input_size, n_hidden, output_size)
lstm = LSTM(input_size, n_hidden, output_size)
gru = GRU(input_size, n_hidden, output_size)

rnn_output, next_hidden = rnn(input, hidden)
print("rnn:", rnn_output)
print(rnn_output.shape)
lstm_output, next_hidden, c = lstm(input, hidden, c)
print("lstm:", lstm_output)
print(lstm_output.shape)
gru_output, next_hidden = gru(input, hidden)
print("gru:", gru_output)
print(gru_output.shape)

rnn: tensor([[[-2.7586, -2.9267, -2.9233, -2.9483, -2.9034, -2.8620, -3.0104, -2.9470, -2.7736, -2.8324, -2.8893, -2.9148, -2.8490, -2.9128, -3.0092, -2.8207, -2.8846, -2.9011]]], grad_fn=<LogSoftmaxBackward>)

torch.Size([1, 1, 18])

lstm: tensor([[[-2.8870, -2.8785, -2.8590, -2.9680, -2.8337, -2.8333, -2.8774, -2.9016, -2.8173, -2.9711, -2.9113, -2.8614, -2.8533, -2.8560, -2.8948, -2.9332, -2.9586, -2.9509]]], grad_fn=<LogSoftmaxBackward>)

torch.Size([1, 1, 18])

gru: tensor([[[-2.9899, -2.8605, -2.8580, -2.8376, -2.9263, -2.9115, -2.8091, -2.9688, -2.9039, -2.9790, -2.8622, -2.9508, -2.8404, -2.9036, -2.8456, -2.8766, -2.8663, -2.8602]]], grad_fn=<LogSoftmaxBackward>)

torch.Size([1, 1, 18])

Build the training functions and train

# Get the most likely category from the model output
def category_from_output(output):
    top_n, top_i = output.topk(1)
    category_i = top_i[0].item()
    return all_categories[category_i], category_i

# A quick look at how torch.topk works
x = torch.arange(1., 6.)
print(x)
print(torch.topk(x, 3))

tensor([1., 2., 3., 4., 5.])

torch.return_types.topk( values=tensor([5., 4., 3.]), indices=tensor([4, 3, 2]))

output = gru_output
category, category_i = category_from_output(output)
print(category, category_i)

Portuguese 15

# Generate a random training example
def random_train_example():
    category = random.choice(all_categories)
    line = random.choice(category_lines[category])
    category_tensor = torch.tensor([all_categories.index(category)], dtype=torch.long)
    line_tensor = line_to_tensor(line)
    return category, line, category_tensor, line_tensor

# Test
for i in range(10):
    category, line, category_tensor, line_tensor = random_train_example()
    print("category=", category, "/ line=", line, "/category_tensor=", category_tensor)

category= Czech / line= Tykal /category_tensor= tensor([5])

category= Japanese / line= Tsukatani /category_tensor= tensor([9])

category= Chinese / line= Xin /category_tensor= tensor([0])

category= Italian / line= Albanesi /category_tensor= tensor([12])

category= Japanese / line= Fujimaki /category_tensor= tensor([9])

category= Scottish / line= Black /category_tensor= tensor([14])

category= French / line= Bernard /category_tensor= tensor([7])

category= Greek / line= Tselios /category_tensor= tensor([8])

category= Chinese / line= Chen /category_tensor= tensor([0])

category= German / line= Hauer /category_tensor= tensor([16])

Build the vanilla RNN training function

# NLLLoss pairs with the LogSoftmax output of the models
criterion = nn.NLLLoss()
learning_rate = 0.005

def train_RNN(category_tensor, line_tensor):
    hidden = rnn.init_hidden()
    rnn.zero_grad()
    # Feed the name through the RNN one character at a time
    for i in range(line_tensor.size()[0]):
        output, hidden = rnn(line_tensor[i], hidden)

    loss = criterion(output.squeeze(0), category_tensor)
    loss.backward()
    # Manual SGD step: move each parameter against its gradient
    for p in rnn.parameters():
        p.data.add_(p.grad.data, alpha=-learning_rate)

    return output, loss.item()
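
The manual update loop above is stochastic gradient descent written by hand. A minimal sketch of the same step using torch.optim.SGD (assuming the rnn instance and criterion defined above; the function name is illustrative):

optimizer = torch.optim.SGD(rnn.parameters(), lr=learning_rate)

def train_RNN_with_optimizer(category_tensor, line_tensor):
    hidden = rnn.init_hidden()
    optimizer.zero_grad()
    for i in range(line_tensor.size()[0]):
        output, hidden = rnn(line_tensor[i], hidden)
    loss = criterion(output.squeeze(0), category_tensor)
    loss.backward()
    # optimizer.step() applies the same p -= learning_rate * p.grad update
    optimizer.step()
    return output, loss.item()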

Build the LSTM training function

def train_LSTM(category_tensor, line_tensor):
    hidden, c = lstm.init_hidden_and_c()
    lstm.zero_grad()
    for i in range(line_tensor.size()[0]):
        output, hidden, c = lstm(line_tensor[i], hidden, c)
    loss = criterion(output.squeeze(0), category_tensor)
    loss.backward()
    for p in lstm.parameters():
        p.data.add_(p.grad.data, alpha=-learning_rate)
    return output, loss.item()

Build the GRU training function

def train_GRU(category_tensor, line_tensor):
    hidden = gru.init_hidden()
    gru.zero_grad()
    for i in range(line_tensor.size()[0]):
        output, hidden = gru(line_tensor[i], hidden)
    loss = criterion(output.squeeze(0), category_tensor)
    loss.backward()
    for p in gru.parameters():
        p.data.add_(p.grad.data, alpha=-learning_rate)
    return output, loss.item()

Build the elapsed-time helper function

def time_since(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return "%dm %ds" % (m, s)
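
A quick check, with a start value contrived to simulate 2 minutes and 5 seconds of elapsed time:

start = time.time() - 125
print(time_since(start))  # 2m 5s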

Build the training loop with progress logging

n_iters = 1000
print_every = 50
plot_every = 10

def train(train_type_fn):
    all_losses = []
    start = time.time()
    current_loss = 0
    for iter in range(1, n_iters + 1):
        category, line, category_tensor, line_tensor = random_train_example()
        output, loss = train_type_fn(category_tensor, line_tensor)
        current_loss += loss
        # Log progress every print_every iterations
        if iter % print_every == 0:
            guess, guess_i = category_from_output(output)
            correct = '✓' if guess == category else '✗ (%s)' % category
            print('%d %d%% (%s) %.4f %s / %s %s' % (iter, iter / n_iters * 100, time_since(start), loss, line, guess, correct))
        # Record the average loss every plot_every iterations
        if iter % plot_every == 0:
            all_losses.append(current_loss / plot_every)
            current_loss = 0
    return all_losses, int(time.time() - start)

Train

# Call train() for the RNN, LSTM, and GRU models,
# returning each model's losses and training time for plotting
all_losses1, period1 = train(train_RNN)
all_losses2, period2 = train(train_LSTM)
all_losses3, period3 = train(train_GRU)

# Plot the loss-comparison curves and the training-time bar chart
# Figure 0
plt.figure(0)
# Loss-comparison curves
plt.plot(all_losses1, label="RNN")
plt.plot(all_losses2, color="red", label="LSTM")
plt.plot(all_losses3, color="orange", label="GRU")
plt.legend(loc='upper left')

# Figure 1
plt.figure(1)
x_data = ["RNN", "LSTM", "GRU"]
y_data = [period1, period2, period3]
# Training-time bar chart
plt.bar(range(len(x_data)), y_data, tick_label=x_data)

...

750 75% (0m 2s) 2.8973 Reijnders / Korean ✗ (Dutch)

800 80% (0m 3s) 2.9324 Huynh / Korean ✗ (Vietnamese)

850 85% (0m 3s) 2.7586 Suk / Korean ✓

900 90% (0m 3s) 2.9910 Stanek / Korean ✗ (Polish)

950 95% (0m 3s) 2.9644 Klimek / Greek ✗ (Polish)

1000 100% (0m 3s) 2.9666 Le / Korean ✗ (Vietnamese)

<BarContainer object of 3 artists>

Conclusions

Analysis of the loss-comparison curves:

  • How fast the training loss drops indicates how well a model converges. The plot shows the vanilla RNN converging best, followed by GRU, then LSTM. The reason: the texts here are names, which are short, and there is essentially no meaningful relationship between distant letters, so the long-range-dependency advantages of the improved LSTM and GRU models cannot come into play. When choosing a model for later work, pick the one best suited to the task based on analysis of the task and experimental comparison.

Analysis of the training-time comparison chart:

  • Training time reflects a model's computational complexity. As the chart shows, and as the earlier theoretical analysis predicts, the vanilla RNN has the lowest complexity and trains fastest, followed by GRU, then LSTM.

Conclusion: model selection is generally done through experimental comparison. A more complex or more advanced model is not necessarily better; the best answer comes from analyzing the data and the experimental results in the context of your specific task.

Build the evaluation functions and run predictions

Build the vanilla RNN evaluation function

def evaluate_RNN(line_tensor):
    hidden = rnn.init_hidden()
    for i in range(line_tensor.size()[0]):
        output, hidden = rnn(line_tensor[i], hidden)
    return output.squeeze(0)

Build the LSTM evaluation function

def evaluate_LSTM(line_tensor):
    hidden, c = lstm.init_hidden_and_c()
    for i in range(line_tensor.size()[0]):
        output, hidden, c = lstm(line_tensor[i], hidden, c)
    return output.squeeze(0)

Build the GRU evaluation function

def evaluate_GRU(line_tensor):
    hidden = gru.init_hidden()
    for i in range(line_tensor.size()[0]):
        output, hidden = gru(line_tensor[i], hidden)
    return output.squeeze(0)
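
The three evaluators differ only in how the recurrent state is initialized and threaded through the loop. A deduplicated sketch (assuming the RNN, LSTM, and GRU classes defined above; the function name is illustrative):

def evaluate_model(model, line_tensor):
    if isinstance(model, LSTM):
        # The LSTM carries both a hidden state and a cell state
        hidden, c = model.init_hidden_and_c()
        for i in range(line_tensor.size()[0]):
            output, hidden, c = model(line_tensor[i], hidden, c)
    else:
        # RNN and GRU carry a single hidden state
        hidden = model.init_hidden()
        for i in range(line_tensor.size()[0]):
            output, hidden = model(line_tensor[i], hidden)
    return output.squeeze(0)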

Test

line = "Bai"
line_tensor = line_to_tensor(line)
rnn_output = evaluate_RNN(line_tensor)
lstm_output = evaluate_LSTM(line_tensor)
gru_output = evaluate_GRU(line_tensor)
print("rnn_output:", rnn_output)
print("lstm_output:", lstm_output)
print("gru_output:", gru_output)

rnn_output: tensor([[-5965.0801, 0.0000, -5944.7900, -5938.4595, -5962.0166, -5947.8330, -5934.5571, -5944.6787, -5954.0552, -5939.5146, -5950.5410, -5955.7983, -5929.8423, -5952.1704, -5936.9551, -5948.0830, -5970.2207, -5947.5967]], grad_fn=<SqueezeBackward1>)

lstm_output: tensor([[-2.8458, -2.9780, -2.8754, -2.9936, -2.8406, -2.9247, -3.0767, -2.8284, -2.9451, -2.8750, -2.8369, -2.7955, -2.8501, -2.8687, -2.9414, -2.8326, -2.8425, -2.9191]], grad_fn=<SqueezeBackward1>)

gru_output: tensor([[-2.9983, -2.6126, -2.9359, -2.9278, -2.8107, -3.0219, -3.0625, -2.9407, -3.0557, -2.8054, -2.6451, -2.9002, -2.7039, -2.7317, -3.1449, -2.9193, -2.9134, -3.1068]], grad_fn=<SqueezeBackward1>)

Build the prediction function

def predict(input_line, evaluate, n_predictions=3):
    print("\n> %s" % input_line)
    # No gradients are needed during prediction
    with torch.no_grad():
        output = evaluate(line_to_tensor(input_line))
        # Take the n_predictions largest values and their indices along dimension 1
        topv, topi = output.topk(n_predictions, 1, True)
        predictions = []
        for i in range(n_predictions):
            value = topv[0][i].item()
            category_index = topi[0][i].item()
            print("(%.2f) %s" % (value, all_categories[category_index]))
            predictions.append([value, all_categories[category_index]])
        return predictions
# Run predictions with each evaluator
for evaluate_fn in [evaluate_RNN, evaluate_LSTM, evaluate_GRU]:
    print("-" * 18)
    predict("Dovesky", evaluate_fn)
    predict("Jackson", evaluate_fn)
    predict("Satoshi", evaluate_fn)

> Dovesky

(0.00) Italian

(-5929.84) Czech

(-5934.56) Irish

> Jackson

(0.00) Italian

(-5929.84) Czech

(-5934.56) Irish

> Satoshi

(0.00) Italian

(-5929.84) Czech

(-5934.56) Irish


> Dovesky

(-2.80) English

(-2.81) Portuguese

(-2.82) Arabic

> Jackson

(-2.80) Portuguese

(-2.82) English

(-2.82) Arabic

> Satoshi

(-2.80) Arabic

(-2.82) English

(-2.82) Portuguese


> Dovesky

(-2.62) Czech

(-2.67) Polish

(-2.69) Greek

> Jackson

(-2.74) Scottish

(-2.75) English

(-2.77) Russian

> Satoshi

(-2.61) Greek

(-2.64) Italian

(-2.70) Czech