Sentiment classification of 1.6 million tweets with an LSTM in PyTorch

2022-07-06 10:25:00 How about a song without trace

Data and code github Address    

Note: training on a CPU will be slow.
# Goal: sentiment classification.
# Dataset: Sentiment140, 1.6 million tweets labelled 0 (negative), 2 (neutral) or 4 (positive).
# In practice, however, the dataset contains no neutral examples.
# 1. Overall process:
# 2. Import the data
# 3. Inspect the data
# 4. Preprocess the data:
#      - count the proportion of each class (positive vs. negative)
#      - define the label and text fields
#      - define the column headers
#      - split the samples into training, validation and test sets
#      - build the vocabulary
#      - pad sequences whose lengths differ
# 5. Build the model
# 6. Train the model
# Import data 

import warnings
import pandas as pd
import matplotlib.pyplot as plt
# Load the raw Sentiment140 dump. The file has no header row, so pandas
# assigns integer column names; column 0 is read below as the sentiment score.
# engine='python' is kept from the original (pandas' default parser is the C engine).
dataset = pd.read_csv(
    "./data/training.1600000.processed.noemoticon.csv",
    encoding="ISO-8859-1",
    engine='python',
    header=None,
)

# Quick look at the table. info() prints by itself; the other results are only
# echoed automatically in a notebook, so print them explicitly when run as a script.
dataset.info()
print(dataset.describe())
print(dataset.head())

# Treat the raw score in column 0 as a categorical variable and show how many
# rows fall into each category.
dataset['sentiment_category'] = dataset[0].astype('category')
print(dataset['sentiment_category'].value_counts())

# Replace the raw category values by their integer codes (0 and 1 when only
# two categories are present, as the article states for this dataset).
dataset['sentiment'] = dataset['sentiment_category'].cat.codes

# Persist the processed table without header or index so that every CSV column
# maps positionally onto the torchtext field list.
dataset.to_csv('./data/train-processed.csv', header=False, index=False)

# Save up to 10000 random rows as a small sample file.
# NOTE(review): these rows are drawn from the same table that is written to
# train-processed.csv, so they overlap with the training data and must not be
# used as an independent test set.
sample_size = min(10000, len(dataset))  # sample() raises if asked for more rows than exist
dataset.sample(sample_size, random_state=0).to_csv(
    "./data/test_sample.csv", header=False, index=False
)
# Set labels and text 
from torchtext.legacy import data
from torchtext.legacy.data import Field,TabularDataset,Iterator,BucketIterator

LABEL = data.LabelField()  # target label
CONTEXT = data.Field(lower=True)  # tweet text, lower-cased before tokenisation

# One (name, field) pair per CSV column, in file order; columns mapped to None
# are skipped. train-processed.csv holds the original columns followed by the
# two columns appended during preprocessing ('sentiment_category', 'sentiment').
# NOTE(review): the tail of this list was truncated in the source. The names
# 'context' and 'label' are the ones the rest of the script reads; the column
# positions assume the standard six-column Sentiment140 layout - confirm.
fields = [
    ('score', None),
    ('id', None),
    ('data', None),
    ('query', None),
    ('name', None),
    ('context', CONTEXT),
    ('category', None),
    ('label', LABEL),
]

# Read the processed CSV (written without a header row).
contextDataset = data.TabularDataset(
    path='./data/train-processed.csv',
    format='CSV',
    fields=fields,
    skip_header=False,
)

# Stratified 80/10/10 split. split_ratio is given as (train, test, valid) but
# the datasets come back in (train, valid, test) order, so unpack accordingly.
train, val, test = contextDataset.split(
    split_ratio=[0.8, 0.1, 0.1],
    stratified=True,
    strata_field='label',
)

# Show a sample
print(vars(train.examples[0]))

# Build the vocabularies from the training split only, so that no information
# from the validation/test splits leaks into the token index.
vocab_size = 20000
CONTEXT.build_vocab(train, max_size=vocab_size)
# LabelField also maps its values through a vocabulary, which has to be built
# before batches can be numericalised.
LABEL.build_vocab(train)

# Vocabulary size: at most max_size tokens plus the specials
# (<unk> for unknown words, <pad> for padding).
print(len(CONTEXT.vocab))

# The most common words in the training data.
print(CONTEXT.vocab.freqs.most_common(10))

# Index -> word for the first ten entries.
print(CONTEXT.vocab.itos[:10])
# Word -> index; only a few entries are shown because the full mapping has
# one entry per vocabulary token.
print(list(CONTEXT.vocab.stoi.items())[:10])

import torch
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Batch the three splits so the data is read one batch at a time.
# sort_within_batch=True sorts the examples inside each batch by sort_key in
# descending order; sort_key is the ordering rule, here the length of
# `context`, i.e. the number of words in each comment.
# NOTE(review): the end of this call was truncated in the source. batch_size=32
# matches the batch size quoted in the shape comments of the model and the
# training loop - confirm.
train_iter, val_iter, test_iter = data.BucketIterator.splits(
    (train, val, test),
    batch_size=32,
    device=device,
    sort_within_batch=True,
    sort_key=lambda x: len(x.context),
)
                     #  model building 
import torch.nn as nn
class simple_LSTM(nn.Module):
    """Single-layer LSTM classifier over sequences of token indices.

    The input is embedded, run through one LSTM layer, and the final hidden
    state is projected onto ``num_classes`` logits.

    Args:
        hidden_size: Size of the LSTM hidden state.
        embedding_dim: Dimension of the word embeddings.
        vocab_size: Number of rows in the embedding table (vocabulary size).
        num_classes: Number of output logits. Defaults to 2 (binary
            classification), matching the original hard-coded value.
    """

    def __init__(self, hidden_size, embedding_dim, vocab_size, num_classes=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.encoder = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size, num_layers=1)
        self.predictor = nn.Linear(hidden_size, num_classes)

    def forward(self, seq):
        """Return class logits for a batch of comments.

        Args:
            seq: Integer tensor of token indices. The LSTM is not created with
                ``batch_first``, so the expected layout is (seq_len, batch).

        Returns:
            Tensor of shape (batch, num_classes) holding unnormalised scores.
        """
        # output: (seq_len, batch, hidden_size); hidden and cell: (1, batch, hidden_size)
        _, (hidden, _) = self.encoder(self.embedding(seq))
        # Only the last layer's final hidden state feeds the classifier. With a
        # single layer, hidden[-1] is the same tensor the original obtained via
        # squeeze(0), without depending on the leading dimension being 1.
        return self.predictor(hidden[-1])
# Create the model. The embedding table is sized from the vocabulary that was
# actually built (max_size plus the special tokens) rather than a hard-coded
# 20002, so the two cannot drift apart.
lstm_model = simple_LSTM(hidden_size=100, embedding_dim=300, vocab_size=len(CONTEXT.vocab))
lstm_model.to(device)  # move the parameters to the selected device

# Model training
from torch import optim
# Optimizer
optimizer = optim.Adam(lstm_model.parameters(), lr=0.001)
# Loss function: cross-entropy over the model's two output logits
# (the data contains only negative and positive examples).
criterion = nn.CrossEntropyLoss()

# History buffers used by the training routine for plotting.
loss_list = []  # loss values
accuracy_list = []  # accuracy values
iteration_list = []  # iteration counter matching the two lists above

def _evaluate(model, criterion, data_iter, device):
    """Return (mean per-sample loss, correct predictions, sample count) over ``data_iter``."""
    model.eval()
    total_loss = 0.0
    correct = 0
    seen = 0
    with torch.no_grad():  # no gradients are needed for evaluation
        for batch in data_iter:
            context = batch.context.to(device)
            target = batch.label.to(device)
            outputs = model(context)
            batch_size = target.size(0)
            # criterion returns the batch mean; weight it by the batch size so
            # that a smaller final batch does not distort the average.
            total_loss += criterion(outputs, target).item() * batch_size
            correct += outputs.argmax(1).eq(target).sum().item()
            seen += batch_size
    mean_loss = total_loss / seen if seen else 0.0
    return mean_loss, correct, seen


def train_val_test(model, optimizer, criterion, train_iter, val_iter, test_iter, epochs):
    """Train ``model`` and report validation and test metrics after every epoch.

    Each batch yielded by the iterators must expose ``context`` (model input)
    and ``label`` (target class indices). After every epoch the per-sample mean
    training, validation and test losses and the test accuracy (in percent) are
    printed, and the epoch number, test loss and test accuracy are appended to
    the module-level ``iteration_list``, ``loss_list`` and ``accuracy_list``.
    When all epochs are done the two histories are plotted with matplotlib.

    Args:
        model: The network to train.
        optimizer: Optimizer bound to ``model``'s parameters.
        criterion: Loss taking (logits, targets) and returning the batch mean.
        train_iter: Iterable of training batches.
        val_iter: Iterable of validation batches.
        test_iter: Iterable of test batches.
        epochs: Number of passes over ``train_iter``.
    """
    # Batches are moved to wherever the model's parameters live.
    device = next(model.parameters()).device

    for epoch in range(1, epochs + 1):
        model.train()
        train_loss = 0.0
        train_seen = 0
        for batch in train_iter:
            context = batch.context.to(device)
            target = batch.label.to(device)
            # Reset the gradients; without this they accumulate across batches.
            optimizer.zero_grad()
            outputs = model(context)
            loss = criterion(outputs, target)
            loss.backward()
            optimizer.step()
            batch_size = target.size(0)
            train_loss += loss.item() * batch_size
            train_seen += batch_size
        train_loss = train_loss / train_seen if train_seen else 0.0
        print("Epoch:{},Train Loss:{:.2f} ".format(epoch, train_loss))

        val_loss, _, _ = _evaluate(model, criterion, val_iter, device)
        print("Epoch:{},Val Loss:{:.2f} ".format(epoch, val_loss))

        test_loss, correct, test_seen = _evaluate(model, criterion, test_iter, device)
        # Divide by the number of samples actually seen rather than
        # (number of batches * size of the last batch).
        accuracy = 100 * correct / test_seen if test_seen else 0.0
        print("Epoch : {}, Test Loss : {:.2f}".format(epoch, test_loss))
        print("Accuracy : {}".format(accuracy))

        # Record the history that is plotted below.
        iteration_list.append(epoch)
        loss_list.append(test_loss)
        accuracy_list.append(accuracy)

    # Loss and accuracy live on different scales, so each gets its own figure.
    plt.figure()
    plt.plot(iteration_list, loss_list)
    plt.xlabel('Number of Iteration')
    plt.ylabel('Loss')

    plt.figure()
    plt.plot(iteration_list, accuracy_list, color='r')
    plt.xlabel('Number of Iteration')
    plt.ylabel('Accuracy')
    plt.show()
# Start the run: 10 epochs of training, with the validation and test passes executed after each epoch.
train_val_test(lstm_model,  optimizer, criterion, train_iter, val_iter, test_iter, epochs=10)


This article was written by [How about a song without trace]. Please include a link to the original when reposting. Thank you.