Name Classifier
About the name classification problem
Given a name as input, a model predicts which country or region the name most likely comes from. This has real value for international products: during user registration, the predicted origin can be used to preselect a likely country or region for the user, display its flag, and validate the expected phone-number length for that region.
Name classification data
The data lives under ./data/names/: 18 plain-text files, one per language (e.g. Chinese.txt), each containing one name per line.
Implementation steps
Import the required packages
Process the data files under data/ into a training-ready form
Build the RNN models (vanilla RNN, LSTM, and GRU)
Build the training functions and train
Build the evaluation functions and make predictions
Import the required packages
```python
from io import open
import glob
import os
import string
import unicodedata
import random
import time
import math
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
```
Process the data files under data/ into a training-ready form
```python
all_letters = string.ascii_letters + ".,;"
n_letters = len(all_letters)
print("n_letter:", n_letters)
```
n_letter: 55
```python
def unicodeToAscii(s):
    """Convert a Unicode string to plain ASCII by dropping accents
    (combining marks) and any character outside all_letters."""
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )
```
```python
s = "Iñíguez"
a = unicodeToAscii(s)
print(a)
```
Iniguez
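Why this works: NFD normalization decomposes an accented character into its base letter plus a combining mark of Unicode category 'Mn', which the filter above then drops. A minimal sketch of the decomposition:

```python
import unicodedata

# 'ñ' decomposes under NFD into a base letter and a combining tilde
for c in unicodedata.normalize('NFD', 'ñ'):
    print(repr(c), unicodedata.category(c))
# 'n'       Ll  -> kept: an ASCII letter present in all_letters
# '\u0303'  Mn  -> dropped: a combining (non-spacing) mark
```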
```python
data_path = "./data/names/"

def read_lines(filename):
    """Read a file, split it into lines, and ASCII-normalize each name."""
    lines = open(filename, encoding='utf-8').read().strip().split('\n')
    return [unicodeToAscii(line) for line in lines]

filename = data_path + "Chinese.txt"
lines = read_lines(filename)
print(lines[:10])
```
['Ang', 'AuYong', 'Bai', 'Ban', 'Bao', 'Bei', 'Bian', 'Bui', 'Cai', 'Cao']
```python
# Map each category (language) to its list of names
category_lines = {}
all_categories = []

for filename in glob.glob(data_path + "*.txt"):
    # The category name is the file name without the .txt extension
    category = os.path.splitext(os.path.basename(filename))[0]
    all_categories.append(category)
    lines = read_lines(filename)
    category_lines[category] = lines

n_categories = len(all_categories)
print("n_categories:", n_categories)
print(category_lines['Italian'][:5])
```
n_categories: 18
['Abandonato', 'Abatangelo', 'Abatantuono', 'Abate', 'Abategiovanni']
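Before training, it can be worth checking how balanced the categories are; in the standard PyTorch names dataset the category sizes differ widely (Russian is by far the largest). A quick sketch, with the caveat that exact counts depend on your copy of the data:

```python
# Print the number of names per category, largest first
for category in sorted(all_categories, key=lambda c: -len(category_lines[c])):
    print(category, len(category_lines[category]))
```

Note that random_train_example, defined later, picks the category uniformly at random before picking a name, so training sees each language equally often despite this imbalance.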
```python
def line_to_tensor(line):
    """Encode a name as a sequence of one-hot vectors with shape
    (len(line), 1, n_letters); the middle dimension is the batch."""
    tensor = torch.zeros(len(line), 1, n_letters)
    for li, letter in enumerate(line):
        tensor[li][0][all_letters.find(letter)] = 1
    return tensor

line = "Bai"
print("line_tensor:", line_to_tensor(line))
```
line_tensor: tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]], [[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]], [[0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]])
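A quick way to sanity-check the one-hot encoding is to recover the hot index at each time step; 'B' maps to index 27 because all_letters lists the 26 lowercase letters before the uppercase ones:

```python
# Recover the index of the 1 in each one-hot step: B -> 27, a -> 0, i -> 8
print(line_to_tensor("Bai").argmax(dim=-1).view(-1))  # tensor([27,  0,  8])
print([all_letters.find(c) for c in "Bai"])           # [27, 0, 8]
```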
Build the RNN models
Build the vanilla RNN model

```python
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        """Four parameters: the size of the last dimension of the input,
        the size of the last dimension of the hidden state, the output
        size (number of categories), and the number of RNN layers."""
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnn = nn.RNN(input_size, hidden_size, num_layers)
        self.linear = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, input, hidden):
        """input is one time step of shape 1 x n_letters; hidden is the
        hidden state of shape self.num_layers x 1 x self.hidden_size."""
        # nn.RNN expects (seq_len, batch, input_size), so add a seq_len dim of 1
        input = input.unsqueeze(0)
        rr, hn = self.rnn(input, hidden)
        return self.softmax(self.linear(rr)), hn

    def init_hidden(self):
        """Create an all-zero initial hidden state."""
        return torch.zeros(self.num_layers, 1, self.hidden_size)
```
Build the LSTM model

```python
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers)
        self.linear = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, input, hidden, c):
        # The LSTM additionally carries a cell state c alongside the hidden state
        input = input.unsqueeze(0)
        rr, (hn, c) = self.lstm(input, (hidden, c))
        return self.softmax(self.linear(rr)), hn, c

    def init_hidden_and_c(self):
        c = hidden = torch.zeros(self.num_layers, 1, self.hidden_size)
        return hidden, c
```
Build the GRU model

```python
class GRU(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super(GRU, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.gru = nn.GRU(input_size, hidden_size, num_layers)
        self.linear = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, input, hidden):
        input = input.unsqueeze(0)
        rr, hn = self.gru(input, hidden)
        return self.softmax(self.linear(rr)), hn

    def init_hidden(self):
        return torch.zeros(self.num_layers, 1, self.hidden_size)
```
```python
input_size = n_letters
n_hidden = 128
output_size = n_categories

# Feed a single one-hot step for the letter 'B'
input = line_to_tensor('B').squeeze(0)
hidden = c = torch.zeros(1, 1, n_hidden)

rnn = RNN(input_size, n_hidden, output_size)
lstm = LSTM(input_size, n_hidden, output_size)
gru = GRU(input_size, n_hidden, output_size)

rnn_output, next_hidden = rnn(input, hidden)
print("rnn:", rnn_output)
print(rnn_output.shape)
lstm_output, next_hidden, c = lstm(input, hidden, c)
print("lstm:", lstm_output)
print(lstm_output.shape)
gru_output, next_hidden = gru(input, hidden)
print("gru:", gru_output)
print(gru_output.shape)
```
rnn: tensor([[[-2.7586, -2.9267, -2.9233, -2.9483, -2.9034, -2.8620, -3.0104, -2.9470, -2.7736, -2.8324, -2.8893, -2.9148, -2.8490, -2.9128, -3.0092, -2.8207, -2.8846, -2.9011]]], grad_fn=<LogSoftmaxBackward>)
torch.Size([1, 1, 18])
lstm: tensor([[[-2.8870, -2.8785, -2.8590, -2.9680, -2.8337, -2.8333, -2.8774, -2.9016, -2.8173, -2.9711, -2.9113, -2.8614, -2.8533, -2.8560, -2.8948, -2.9332, -2.9586, -2.9509]]], grad_fn=<LogSoftmaxBackward>)
torch.Size([1, 1, 18])
gru: tensor([[[-2.9899, -2.8605, -2.8580, -2.8376, -2.9263, -2.9115, -2.8091, -2.9688, -2.9039, -2.9790, -2.8622, -2.9508, -2.8404, -2.9036, -2.8456, -2.8766, -2.8663, -2.8602]]], grad_fn=<LogSoftmaxBackward>)
torch.Size([1, 1, 18])
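All three models end in LogSoftmax, so each output row holds log-probabilities over the 18 categories; exponentiating should therefore give a distribution that sums to 1. A quick check:

```python
# exp of log-probabilities should sum to ~1.0 over the 18 categories
print(torch.exp(rnn_output).sum().item())
print(torch.exp(lstm_output).sum().item())
print(torch.exp(gru_output).sum().item())
```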
Build the training functions and train

```python
def category_from_output(output):
    """Return the most likely category and its index from the model output."""
    top_n, top_i = output.topk(1)
    category_i = top_i[0].item()
    return all_categories[category_i], category_i

# torch.topk returns the k largest values and their indices
x = torch.arange(1., 6.)
print(x)
print(torch.topk(x, 3))
```
tensor([1., 2., 3., 4., 5.])
torch.return_types.topk( values=tensor([5., 4., 3.]), indices=tensor([4, 3, 2]))
```python
output = gru_output
category, category_i = category_from_output(output)
print(category, category_i)
```
Portuguese 15
```python
def random_train_example():
    """Draw a random (category, name) training pair and its tensors."""
    category = random.choice(all_categories)
    line = random.choice(category_lines[category])
    category_tensor = torch.tensor([all_categories.index(category)], dtype=torch.long)
    line_tensor = line_to_tensor(line)
    return category, line, category_tensor, line_tensor

for i in range(10):
    category, line, category_tensor, line_tensor = random_train_example()
    print("category=", category, "/ line=", line, "/category_tensor=", category_tensor)
```
category= Czech / line= Tykal /category_tensor= tensor([5])
category= Japanese / line= Tsukatani /category_tensor= tensor([9])
category= Chinese / line= Xin /category_tensor= tensor([0])
category= Italian / line= Albanesi /category_tensor= tensor([12])
category= Japanese / line= Fujimaki /category_tensor= tensor([9])
category= Scottish / line= Black /category_tensor= tensor([14])
category= French / line= Bernard /category_tensor= tensor([7])
category= Greek / line= Tselios /category_tensor= tensor([8])
category= Chinese / line= Chen /category_tensor= tensor([0])
category= German / line= Hauer /category_tensor= tensor([16])
Build the vanilla RNN training function

```python
criterion = nn.NLLLoss()
learning_rate = 0.005

def train_RNN(category_tensor, line_tensor):
    hidden = rnn.init_hidden()
    rnn.zero_grad()
    # Feed the name one letter at a time, threading the hidden state through
    for i in range(line_tensor.size()[0]):
        output, hidden = rnn(line_tensor[i], hidden)
    loss = criterion(output.squeeze(0), category_tensor)
    loss.backward()
    # Manual SGD step: subtract the gradient (negative alpha) to descend
    for p in rnn.parameters():
        p.data.add_(p.grad.data, alpha=-learning_rate)
    return output, loss.item()
```
Build the LSTM training function

```python
def train_LSTM(category_tensor, line_tensor):
    hidden, c = lstm.init_hidden_and_c()
    lstm.zero_grad()
    for i in range(line_tensor.size()[0]):
        output, hidden, c = lstm(line_tensor[i], hidden, c)
    loss = criterion(output.squeeze(0), category_tensor)
    loss.backward()
    for p in lstm.parameters():
        p.data.add_(p.grad.data, alpha=-learning_rate)
    return output, loss.item()
```
Build the GRU training function

```python
def train_GRU(category_tensor, line_tensor):
    hidden = gru.init_hidden()
    gru.zero_grad()
    for i in range(line_tensor.size()[0]):
        output, hidden = gru(line_tensor[i], hidden)
    loss = criterion(output.squeeze(0), category_tensor)
    loss.backward()
    for p in gru.parameters():
        p.data.add_(p.grad.data, alpha=-learning_rate)
    return output, loss.item()
```
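The manual parameter loop in the three training functions above is plain SGD written by hand. For reference, here is an equivalent sketch that delegates the update to torch.optim; this is an alternative formulation, not part of the original code, shown for the GRU:

```python
# Hypothetical variant of train_GRU using an optimizer instead of the manual update
optimizer = torch.optim.SGD(gru.parameters(), lr=learning_rate)

def train_GRU_with_optim(category_tensor, line_tensor):
    hidden = gru.init_hidden()
    optimizer.zero_grad()
    for i in range(line_tensor.size()[0]):
        output, hidden = gru(line_tensor[i], hidden)
    loss = criterion(output.squeeze(0), category_tensor)
    loss.backward()
    optimizer.step()  # applies p -= lr * p.grad to every parameter
    return output, loss.item()
```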
Build a timing helper

```python
def time_since(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return "%dm %ds" % (m, s)
```
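For example, a start timestamp 125 seconds in the past formats as minutes plus remaining seconds:

```python
# A start time 125 seconds ago renders as "2m 5s"
start = time.time() - 125
print(time_since(start))  # 2m 5s
```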
Build the training loop with progress logging

```python
n_iters = 1000
print_every = 50
plot_every = 10

def train(train_type_fn):
    """Run n_iters random training steps with the given step function,
    logging progress and recording the average loss every plot_every steps."""
    all_losses = []
    start = time.time()
    current_loss = 0
    for iter in range(1, n_iters + 1):
        category, line, category_tensor, line_tensor = random_train_example()
        output, loss = train_type_fn(category_tensor, line_tensor)
        current_loss += loss
        if iter % print_every == 0:
            guess, guess_i = category_from_output(output)
            correct = '✓' if guess == category else '✗ (%s)' % category
            print('%d %d%% (%s) %.4f %s / %s %s' % (iter, iter / n_iters * 100,
                  time_since(start), loss, line, guess, correct))
        if iter % plot_every == 0:
            all_losses.append(current_loss / plot_every)
            current_loss = 0
    return all_losses, int(time.time() - start)
```
Train

```python
all_losses1, period1 = train(train_RNN)
all_losses2, period2 = train(train_LSTM)
all_losses3, period3 = train(train_GRU)

# Loss comparison curves
plt.figure(0)
plt.plot(all_losses1, label="RNN")
plt.plot(all_losses2, color="red", label="LSTM")
plt.plot(all_losses3, color="orange", label="GRU")
plt.legend(loc='upper left')

# Training-time comparison bar chart
plt.figure(1)
x_data = ["RNN", "LSTM", "GRU"]
y_data = [period1, period2, period3]
plt.bar(range(len(x_data)), y_data, tick_label=x_data)
```
...
750 75% (0m 2s) 2.8973 Reijnders / Korean ✗ (Dutch)
800 80% (0m 3s) 2.9324 Huynh / Korean ✗ (Vietnamese)
850 85% (0m 3s) 2.7586 Suk / Korean ✓
900 90% (0m 3s) 2.9910 Stanek / Korean ✗ (Polish)
950 95% (0m 3s) 2.9644 Klimek / Greek ✗ (Polish)
1000 100% (0m 3s) 2.9666 Le / Korean ✗ (Vietnamese)
<BarContainer object of 3 artists>
Conclusions
Loss-curve analysis:
How quickly the training loss falls indicates how quickly a model converges. The plot shows the vanilla RNN converging best on this task, followed by the GRU, with the LSTM last. The reason: names are short sequences, and letters far apart in a name carry essentially no specific correlation, so the strength of LSTM and GRU at capturing long-range dependencies cannot come into play. When choosing a model, analyze the task and compare experimentally to find the best fit.
Training-time analysis:
Training time reflects a model's computational complexity. As the theory predicts, the vanilla RNN has the lowest complexity and trains fastest, followed by the GRU, with the LSTM last.
Conclusion: model selection is generally settled by experimental comparison. A more complex or more advanced model is not automatically better; the best answer comes from analyzing your specific task and data and letting the experimental results decide.
Build the evaluation functions and make predictions
Build the vanilla RNN evaluation function

```python
def evaluate_RNN(line_tensor):
    hidden = rnn.init_hidden()
    for i in range(line_tensor.size()[0]):
        output, hidden = rnn(line_tensor[i], hidden)
    return output.squeeze(0)
```
Build the LSTM evaluation function

```python
def evaluate_LSTM(line_tensor):
    hidden, c = lstm.init_hidden_and_c()
    for i in range(line_tensor.size()[0]):
        output, hidden, c = lstm(line_tensor[i], hidden, c)
    return output.squeeze(0)
```
Build the GRU evaluation function

```python
def evaluate_GRU(line_tensor):
    hidden = gru.init_hidden()
    for i in range(line_tensor.size()[0]):
        output, hidden = gru(line_tensor[i], hidden)
    return output.squeeze(0)
```
Test

```python
line = "Bai"
line_tensor = line_to_tensor(line)
rnn_output = evaluate_RNN(line_tensor)
lstm_output = evaluate_LSTM(line_tensor)
gru_output = evaluate_GRU(line_tensor)
print("rnn_output:", rnn_output)
print("lstm_output:", lstm_output)
print("gru_output:", gru_output)
```
rnn_output: tensor([[-5965.0801, 0.0000, -5944.7900, -5938.4595, -5962.0166, -5947.8330, -5934.5571, -5944.6787, -5954.0552, -5939.5146, -5950.5410, -5955.7983, -5929.8423, -5952.1704, -5936.9551, -5948.0830, -5970.2207, -5947.5967]], grad_fn=<SqueezeBackward1>)
lstm_output: tensor([[-2.8458, -2.9780, -2.8754, -2.9936, -2.8406, -2.9247, -3.0767, -2.8284, -2.9451, -2.8750, -2.8369, -2.7955, -2.8501, -2.8687, -2.9414, -2.8326, -2.8425, -2.9191]], grad_fn=<SqueezeBackward1>)
gru_output: tensor([[-2.9983, -2.6126, -2.9359, -2.9278, -2.8107, -3.0219, -3.0625, -2.9407, -3.0557, -2.8054, -2.6451, -2.9002, -2.7039, -2.7317, -3.1449, -2.9193, -2.9134, -3.1068]], grad_fn=<SqueezeBackward1>)
Build the prediction function

```python
def predict(input_line, evaluate, n_predictions=3):
    """Print and return the top n_predictions categories for a name."""
    print("\n> %s" % input_line)
    with torch.no_grad():
        output = evaluate(line_to_tensor(input_line))
        # Top n values and indices along the category dimension
        topv, topi = output.topk(n_predictions, 1, True)
        predictions = []
        for i in range(n_predictions):
            value = topv[0][i].item()
            category_index = topi[0][i].item()
            print("(%.2f) %s" % (value, all_categories[category_index]))
            predictions.append([value, all_categories[category_index]])
    return predictions
```
```python
for evaluate_fn in [evaluate_RNN, evaluate_LSTM, evaluate_GRU]:
    print("-" * 18)
    predict("Dovesky", evaluate_fn)
    predict("Jackson", evaluate_fn)
    predict("Satoshi", evaluate_fn)
```
> Dovesky
(0.00) Italian
(-5929.84) Czech
(-5934.56) Irish
> Jackson
(0.00) Italian
(-5929.84) Czech
(-5934.56) Irish
> Satoshi
(0.00) Italian
(-5929.84) Czech
(-5934.56) Irish
> Dovesky
(-2.80) English
(-2.81) Portuguese
(-2.82) Arabic
> Jackson
(-2.80) Portuguese
(-2.82) English
(-2.82) Arabic
> Satoshi
(-2.80) Arabic
(-2.82) English
(-2.82) Portuguese
> Dovesky
(-2.62) Czech
(-2.67) Polish
(-2.69) Greek
> Jackson
(-2.74) Scottish
(-2.75) English
(-2.77) Russian
> Satoshi
(-2.61) Greek
(-2.64) Italian
(-2.70) Czech