C2W model - language model
2022-06-12 06:07:00 【Singing under the hedge】
C2W Model - Language Model (bidirectional LSTM)

1. File directory
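A plausible project layout, inferred from the file names and imports used in the snippets below (data_load.py, model.py, config.py, and main.py are assumed module names, not confirmed by the post):

wiki_00        # raw corpus, one JSON object per line
train.txt      # produced by the split script in section 3
valid.txt
test.txt
word2id        # pickled vocabularies produced by data_load.py
char2id
data_load.py   # Char_LM_Dataset (section 4)
model.py       # C2W model (section 5)
config.py      # hyper-parameters / argument parser used in section 6
main.py        # training and testing script (section 6)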

2. Corpus (wiki_00)
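Each line of wiki_00 is expected to be a JSON object with a "text" field (the format produced by WikiExtractor with JSON output); the script below only reads "text". A hypothetical line:

{"id": "12", "url": "https://en.wikipedia.org/wiki?curid=12", "title": "Anarchism", "text": "Anarchism is a political philosophy and movement ...\n\n..."}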

3. Creating the training, validation, and test sets
import json
import nltk

# Load the raw corpus: one JSON object per line
datas = open("./wiki_00", encoding="utf-8").read().splitlines()
# Create the training, validation, and test sets
f_train = open("train.txt", "w", encoding="utf-8")
f_valid = open("valid.txt", "w", encoding="utf-8")
f_test = open("test.txt", "w", encoding="utf-8")
num_words = 0
for data in datas:
    data = json.loads(data, strict=False)        # parse the JSON line
    sentences = data["text"]                     # extract the article text
    sentences = sentences.replace("\n\n", ".")
    sentences = sentences.replace("\n", ".")
    sentences = nltk.sent_tokenize(sentences)    # split the text into sentences
    for sentence in sentences:
        sentence = nltk.word_tokenize(sentence)  # split each sentence into words
        if len(sentence) < 10 or len(sentence) > 100:
            continue                             # keep only sentences of 10-100 words
        num_words = num_words + len(sentence)
        sentence = " ".join(sentence) + "\n"
        if num_words <= 1000000:                 # first 1,000,000 words -> training set
            f_train.write(sentence)
        elif num_words <= 1020000:               # next 20,000 words -> validation set
            f_valid.write(sentence)
        elif num_words <= 1040000:               # next 20,000 words -> test set
            f_test.write(sentence)
        else:
            exit()
4. Building word2id, char2id, features, and labels
# Load the training set and build word2id / char2id, features and labels
import os.path
import pickle
import numpy as np
from collections import Counter
from torch.utils import data

class Char_LM_Dataset(data.Dataset):
    def __init__(self, mode="train", max_word_length=16, max_sentence_length=100):
        self.path = os.path.abspath('.')
        self.mode = mode
        self.max_word_length = max_word_length
        self.max_sentence_length = max_sentence_length
        datas = self.read_file()
        datas, char_datas, weights = self.generate_data_label(datas)
        self.datas = datas.reshape([-1])                                  # (3592800,)
        self.char_datas = char_datas.reshape([-1, self.max_word_length])  # (3592800, 16)
        self.weights = weights                                            # (3592800,)
        print(self.datas.shape, self.char_datas.shape, weights.shape)

    def __getitem__(self, index):
        return self.char_datas[index], self.datas[index], self.weights[index]

    def __len__(self):
        return len(self.datas)

    def read_file(self):
        if self.mode == "train":
            datas = open(self.path + "/train.txt", encoding="utf-8").read().strip("\n").splitlines()
            datas = [s.split() for s in datas]
            # Build word2id and char2id
            if not os.path.exists(self.path + "/word2id"):
                words = []
                chars = []
                for data in datas:
                    for word in data:
                        words.append(word.lower())
                        chars.extend(word)
                # Keep the 4998 most frequent words and the 509 most frequent characters
                words = dict(Counter(words).most_common(5000 - 2))
                chars = dict(Counter(chars).most_common(512 - 3))
                word2id = {"<pad>": 0, "<unk>": 1}
                for word in words:
                    word2id[word] = len(word2id)
                char2id = {"<pad>": 0, "<unk>": 1, "<start>": 2}
                for char in chars:
                    char2id[char] = len(char2id)
                self.word2id = word2id
                self.char2id = char2id
                pickle.dump(self.word2id, open(self.path + "/word2id", "wb"))
                pickle.dump(self.char2id, open(self.path + "/char2id", "wb"))
            else:
                self.word2id = pickle.load(open(self.path + "/word2id", "rb"))
                self.char2id = pickle.load(open(self.path + "/char2id", "rb"))
            return datas
        elif self.mode == "valid":
            datas = open(self.path + "/valid.txt", encoding="utf-8").read().strip("\n").splitlines()
            datas = [s.split() for s in datas]
            self.word2id = pickle.load(open(self.path + "/word2id", "rb"))
            self.char2id = pickle.load(open(self.path + "/char2id", "rb"))
            return datas
        elif self.mode == "test":
            datas = open(self.path + "/test.txt", encoding="utf-8").read().strip("\n").splitlines()
            datas = [s.split() for s in datas]
            self.word2id = pickle.load(open(self.path + "/word2id", "rb"))
            self.char2id = pickle.load(open(self.path + "/char2id", "rb"))
            return datas

    def generate_data_label(self, datas):
        # Build features (character ids) and labels (word ids)
        char_datas = []
        weights = []
        for i, data in enumerate(datas):
            if i % 1000 == 0:
                print(i, len(datas))
            char_data = [[self.char2id["<start>"]] * self.max_word_length]  # a <start> "word" shifts the character inputs one step ahead of the target words
            for j, word in enumerate(data):
                char_word = []
                for char in word:
                    char_word.append(self.char2id.get(char, self.char2id["<unk>"]))  # map characters to ids
                char_word = char_word[0:self.max_word_length] + \
                            [self.char2id["<pad>"]] * (self.max_word_length - len(char_word))  # pad all words to the same length
                datas[i][j] = self.word2id.get(datas[i][j].lower(), self.word2id["<unk>"])  # map words to ids
                char_data.append(char_word)
            weights.extend([1] * len(datas[i]) + [0] * (self.max_sentence_length - len(datas[i])))
            datas[i] = datas[i][0:self.max_sentence_length] + \
                       [self.word2id["<pad>"]] * (self.max_sentence_length - len(datas[i]))  # pad all sentences to the same length
            char_datas.append(char_data)
            char_datas[i] = char_datas[i][0:self.max_sentence_length] + \
                            [[self.char2id["<pad>"]] * self.max_word_length] * (self.max_sentence_length - len(char_datas[i]))  # pad all sentences to the same length
        datas = np.array(datas)
        char_datas = np.array(char_datas)
        weights = np.array(weights)
        return datas, char_datas, weights

if __name__ == "__main__":
    char_lm_dataset = Char_LM_Dataset()
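A minimal usage sketch (an assumption, not part of the original post) showing how the dataset above can be consumed; the batch size 32 * 100 mirrors the test DataLoader in section 6, and the printed shapes assume max_word_length=16:

from torch.utils.data import DataLoader
dataset = Char_LM_Dataset(mode="train")
loader = DataLoader(dataset, batch_size=32 * 100, shuffle=False)
char_ids, word_ids, weights = next(iter(loader))
# each sample is one word position: 16 character ids, the target word id, and a 0/1 padding weight
print(char_ids.shape, word_ids.shape, weights.shape)  # torch.Size([3200, 16]) torch.Size([3200]) torch.Size([3200])

Because shuffle=False and the batch size is a multiple of max_sentence_length, every batch holds whole sentences laid out in order, which is what lets the C2W model in section 5 reshape the flat stream of words back into [batch, sentence_length, word_embed_size].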
5. Building the C2W model
import torch
import torch.nn as nn

class C2W(nn.Module):
    def __init__(self, config):
        super(C2W, self).__init__()
        self.char_hidden_size = config.char_hidden_size
        self.word_embed_size = config.word_embed_size
        self.lm_hidden_size = config.lm_hidden_size
        self.character_embedding = nn.Embedding(config.n_chars, config.char_embed_size)  # character embedding layer, 64 -> 50
        self.sentence_length = config.max_sentence_length
        self.char_lstm = nn.LSTM(input_size=config.char_embed_size, hidden_size=config.char_hidden_size,
                                 bidirectional=True, batch_first=True)  # character-level bidirectional LSTM, 50 -> 50
        self.lm_lstm = nn.LSTM(input_size=self.word_embed_size, hidden_size=config.lm_hidden_size,
                               batch_first=True)  # language-model LSTM, 50 -> 150
        self.fc_1 = nn.Linear(2 * config.char_hidden_size, config.word_embed_size)  # linear combination that produces the word representation
        self.fc_2 = nn.Linear(config.lm_hidden_size, config.vocab_size)  # projection to the vocabulary for prediction

    def forward(self, x):
        input = self.character_embedding(x)  # [64, 16, 50]
        char_lstm_result = self.char_lstm(input)  # output: [64, 16, 100]
        # Concatenate the last step of the forward direction with the first step of the backward direction
        word_input = torch.cat([char_lstm_result[0][:, -1, 0:self.char_hidden_size],
                                char_lstm_result[0][:, 0, self.char_hidden_size:]], dim=1)  # [64, 100]
        word_input = self.fc_1(word_input)  # [64, 50]
        word_input = word_input.view([-1, self.sentence_length, self.word_embed_size])  # [8, 8, 50]
        lm_lstm_result = self.lm_lstm(word_input)[0].contiguous()  # [8, 8, 150]
        lm_lstm_result = lm_lstm_result.view([-1, self.lm_hidden_size])  # [64, 150]
        print(lm_lstm_result.shape)
        out = self.fc_2(lm_lstm_result)  # [64, 1000]
        return out

class config:
    def __init__(self):
        self.n_chars = 64              # number of characters
        self.char_embed_size = 50      # character embedding size
        self.max_sentence_length = 8   # maximum sentence length
        self.char_hidden_size = 50     # hidden size of the character LSTM
        self.lm_hidden_size = 150      # hidden size of the language-model LSTM
        self.word_embed_size = 50      # size of the generated word representation
        self.vocab_size = 1000         # vocabulary size

if __name__ == "__main__":
    config = config()
    c2w = C2W(config)
    test = torch.zeros([64, 16], dtype=torch.long)  # nn.Embedding expects a LongTensor, not a NumPy array
    c2w(test)
6. Training and testing
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
from model import C2W
from data_load import Char_LM_Dataset
from tqdm import tqdm
import config as argumentparser

config = argumentparser.ArgumentParser()

def get_test_result(data_iter, data_set):
    # Compute the perplexity on a validation / test set
    model.eval()
    all_ppl = 0
    for data, label, weights in data_iter:
        if config.cuda and torch.cuda.is_available():
            data = data.cuda()
            label = label.cuda()
            weights = weights.cuda()
        else:
            data = torch.autograd.Variable(data).long()
            label = torch.autograd.Variable(label).squeeze()
        out = model(data)
        loss_now = criterion(out, autograd.Variable(label.long()))
        ppl = (loss_now * weights.float()).view([-1, config.max_sentence_length])
        ppl = torch.sum(ppl, dim=1) / torch.sum((weights.view([-1, config.max_sentence_length])) != 0, dim=1).float()
        ppl = torch.sum(torch.exp(ppl))
        all_ppl += ppl.data.item()
    return all_ppl * config.max_sentence_length / data_set.__len__()

if __name__ == "__main__":
    # Check whether CUDA can be used
    if config.cuda and torch.cuda.is_available():
        torch.cuda.set_device(config.gpu)
    # Load the training set
    training_set = Char_LM_Dataset(mode="train")
    training_iter = torch.utils.data.DataLoader(dataset=training_set,
                                                batch_size=config.batch_size * config.max_sentence_length,
                                                shuffle=False,
                                                num_workers=2)
    # Load the validation set
    valid_set = Char_LM_Dataset(mode="valid")
    valid_iter = torch.utils.data.DataLoader(dataset=valid_set,
                                             batch_size=config.batch_size * config.max_sentence_length,
                                             shuffle=False,
                                             num_workers=0)
    # Load the test set
    test_set = Char_LM_Dataset(mode="test")
    test_iter = torch.utils.data.DataLoader(dataset=test_set,
                                            batch_size=32 * 100,
                                            shuffle=False,
                                            num_workers=0)
    model = C2W(config)
    if config.cuda and torch.cuda.is_available():
        model.cuda()
    criterion = nn.CrossEntropyLoss(reduction="none")  # keep per-token losses so padding can be masked out
    optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)
    loss = -1
    for epoch in range(config.epoch):
        model.train()
        process_bar = tqdm(training_iter)
        for data, label, weights in process_bar:
            if config.cuda and torch.cuda.is_available():
                data = data.cuda()
                label = label.cuda()
                weights = weights.cuda()
            else:
                data = torch.autograd.Variable(data).long()
                label = torch.autograd.Variable(label).squeeze()
            out = model(data)
            loss_now = criterion(out, autograd.Variable(label.long()))
            ppl = (loss_now * weights.float()).view([-1, config.max_sentence_length])
            ppl = torch.sum(ppl, dim=1) / torch.sum((weights.view([-1, config.max_sentence_length])) != 0, dim=1).float()
            ppl = torch.mean(torch.exp(ppl))
            loss_now = torch.sum(loss_now * weights.float()) / torch.sum(weights != 0)
            if loss == -1:
                loss = loss_now.data.item()
            else:
                loss = 0.95 * loss + 0.05 * loss_now.data.item()  # exponential moving average of the loss
            process_bar.set_postfix(loss=loss, ppl=ppl.data.item())
            process_bar.update()
            optimizer.zero_grad()
            loss_now.backward()
            optimizer.step()
        print("Valid ppl is:", get_test_result(valid_iter, valid_set))
        print("Test ppl is:", get_test_result(test_iter, test_set))
Experimental results
The output is the perplexity of the language model.
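For reference, the per-sentence perplexity computed in the training loop above is the exponential of the average cross-entropy over the real (non-padding) tokens. A small self-contained sketch with hypothetical loss values:

import torch
# hypothetical per-token cross-entropy losses for one sentence of length 3, padded to length 5
token_loss = torch.tensor([2.3, 1.7, 2.0, 0.0, 0.0])
weights    = torch.tensor([1.0, 1.0, 1.0, 0.0, 0.0])  # 1 = real token, 0 = <pad>
ppl = torch.exp(torch.sum(token_loss * weights) / torch.sum(weights != 0))
print(ppl)  # tensor(7.3891): exp((2.3 + 1.7 + 2.0) / 3) = exp(2.0)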