
CharTextCNN (AG dataset: news topic classification)

2022-06-12 06:07:00 Singing under the hedge


CharTextCNN


1. File directory

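The directory screenshot is not preserved. Judging from the imports in the training script below (from model import CharTextCNN, from data import AG_Data, import config as argumentparser), the layout is roughly the following; note the section headings call the modules data_loader.py and chartextcnn.py, while the script imports them as data and model, so the file names here are inferred, not confirmed:

chartextcnn/
├── config.py          # argument parsing (imported below as config)
├── data.py            # the AG_Data dataset class (section 3)
├── model.py           # the CharTextCNN model (section 4)
├── main.py            # the training/testing script (section 5)
└── data/
    ├── alphabet.json  # JSON list of the alphabet characters
    └── AG/
        ├── train.csv
        └── test.csv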

2. Corpus download addresses (this article uses AG News)

AG News: https://s3.amazonaws.com/fast-ai-nlp/ag_news_csv.tgz
DBPedia: https://s3.amazonaws.com/fast-ai-nlp/dbpedia_csv.tgz
Sogou news: https://s3.amazonaws.com/fast-ai-nlp/sogou_news_csv.tgz
Yelp Review Polarity: https://s3.amazonaws.com/fast-ai-nlp/yelp_review_polarity_csv.tgz
Yelp Review Full: https://s3.amazonaws.com/fast-ai-nlp/yelp_review_full_csv.tgz
Yahoo! Answers: https://s3.amazonaws.com/fast-ai-nlp/yahoo_answers_csv.tgz
Amazon Review Full: https://s3.amazonaws.com/fast-ai-nlp/amazon_review_full_csv.tgz
Amazon Review Polarity: https://s3.amazonaws.com/fast-ai-nlp/amazon_review_polarity_csv.tgz
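
As a convenience, AG News can be fetched and unpacked with a few lines of Python. This is only a sketch: the archive extracts to a folder named ag_news_csv, which then has to be moved to data/AG/ to match the paths used by the loader below.

import urllib.request
import tarfile

url = "https://s3.amazonaws.com/fast-ai-nlp/ag_news_csv.tgz"
urllib.request.urlretrieve(url, "ag_news_csv.tgz")
with tarfile.open("ag_news_csv.tgz") as tar:
    tar.extractall(".")  # extracts to ./ag_news_csv/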

3. Data processing (data_loader.py)

1. Load the dataset
2. Read the labels and text
3. Read the alphabet (all characters)
4. One-hot encode each sentence

import os
import json
import csv
import numpy as np
from torch.utils import data

class AG_Data(data.Dataset):  # subclass Dataset (not DataLoader): it only needs __getitem__ and __len__
    def __init__(self, data_path, l0=1014):
        self.path = os.path.abspath('.')
        if "data" not in self.path:
            self.path += "/data"
        self.data_path = data_path
        self.l0 = l0  # fixed number of character positions per sentence
        self.load_Alphabet()
        self.load(self.data_path)

    # Read the alphabet (all characters the model can see)
    def load_Alphabet(self):
        with open(self.path + "/alphabet.json") as f:
            self.alphabet = "".join(json.load(f))

    # Load the dataset: read labels and text
    def load(self, data_path, lowercase=True):
        self.label = []
        self.data = []
        with open(self.path + data_path, "r") as f:
            # Fields are comma-separated (delimiter) and wrapped in double quotes (quotechar)
            datas = list(csv.reader(f, delimiter=',', quotechar='"'))
            for row in datas:
                self.label.append(int(row[0]) - 1)  # labels in the CSV are 1-based
                txt = " ".join(row[1:])             # join title and description
                if lowercase:
                    txt = txt.lower()
                self.data.append(txt)
        self.y = self.label

    # One-hot representation of a sentence.
    # X: character one-hot (features) * character positions (length = l0 = 1014); y: label
    def __getitem__(self, idx):
        X = self.oneHotEncode(idx)
        y = self.y[idx]
        return X, y

    def oneHotEncode(self, idx):
        # float32, so batches match the model's weights without an extra cast
        X = np.zeros([len(self.alphabet), self.l0], dtype=np.float32)
        # Characters are quantized in reverse order, as in the original paper;
        # texts longer than l0 characters are truncated
        for index_char, char in enumerate(self.data[idx][::-1][:self.l0]):
            if self.char2Index(char) != -1:
                X[self.char2Index(char)][index_char] = 1.0
        return X

    # Return the index of a character in the alphabet, or -1 if it is absent
    def char2Index(self, char):
        return self.alphabet.find(char)

    # Number of samples
    def __len__(self):
        return len(self.label)
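
The loader expects data/alphabet.json, which the post does not show. A minimal way to create it, assuming the 70-character alphabet from the original CharTextCNN paper (lowercase letters, digits, punctuation, and newline; the paper's listing repeats '-', and find() simply uses the first occurrence):

import json

alphabet = list("abcdefghijklmnopqrstuvwxyz0123456789"
                "-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}\n")
with open("data/alphabet.json", "w") as f:
    json.dump(alphabet, f)

With the alphabet and data/AG/train.csv in place, a quick sanity check:

dataset = AG_Data(data_path="/AG/train.csv", l0=1014)
X, y = dataset[0]
print(X.shape, y)  # (70, 1014) and an integer label in 0..3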

4. Model (chartextcnn.py)

(Architecture figure omitted: six convolutional blocks followed by three fully connected layers, as in Zhang et al., 2015, "Character-level Convolutional Networks for Text Classification".)
Batch normalization is applied inside every convolution block, right after the convolution and before the ReLU. For each feature channel, with mini-batch mean μ and variance σ²:

x̂ = (x − μ) / √(σ² + ε),   y = γ · x̂ + β

where γ and β are learned per-channel scale and shift parameters and ε is a small constant for numerical stability.

import torch
import torch.nn as nn

class CharTextCNN(nn.Module):
    def __init__(self, config):
        super(CharTextCNN, self).__init__()
        # Each conv layer consumes the previous layer's feature maps;
        # the first consumes the alphabet-sized one-hot input
        in_features = [config.char_num] + config.features[0:-1]
        out_features = config.features
        kernel_sizes = config.kernel_sizes
        # Input: batch_size * 70 * 1014
        self.conv1 = nn.Sequential(
            nn.Conv1d(in_features[0], out_features[0], kernel_size=kernel_sizes[0], stride=1),  # 1-D convolution
            nn.BatchNorm1d(out_features[0]),       # batch-norm layer
            nn.ReLU(),                             # ReLU activation
            nn.MaxPool1d(kernel_size=3, stride=3)  # 1-D max pooling
        )  # convolution + BN + ReLU + pooling block
        self.conv2 = nn.Sequential(
            nn.Conv1d(in_features[1], out_features[1], kernel_size=kernel_sizes[1], stride=1),
            nn.BatchNorm1d(out_features[1]),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=3, stride=3)
        )
        self.conv3 = nn.Sequential(
            nn.Conv1d(in_features[2], out_features[2], kernel_size=kernel_sizes[2], stride=1),
            nn.BatchNorm1d(out_features[2]),
            nn.ReLU()
        )
        self.conv4 = nn.Sequential(
            nn.Conv1d(in_features[3], out_features[3], kernel_size=kernel_sizes[3], stride=1),
            nn.BatchNorm1d(out_features[3]),
            nn.ReLU()
        )
        self.conv5 = nn.Sequential(
            nn.Conv1d(in_features[4], out_features[4], kernel_size=kernel_sizes[4], stride=1),
            nn.BatchNorm1d(out_features[4]),
            nn.ReLU()
        )
        self.conv6 = nn.Sequential(
            nn.Conv1d(in_features[5], out_features[5], kernel_size=kernel_sizes[5], stride=1),
            nn.BatchNorm1d(out_features[5]),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=3, stride=3)
        )
        self.fc1 = nn.Sequential(
            nn.Linear(8704, 1024),  # 8704 = ((l0 - 96) / 27) * 256 = 34 * 256 for l0 = 1014
            nn.ReLU(),
            nn.Dropout(p=config.dropout)  # dropout layer
        )  # fully connected + ReLU + dropout block

        self.fc2 = nn.Sequential(
            nn.Linear(1024, 1024),
            nn.ReLU(),
            nn.Dropout(p=config.dropout)
        )

        self.fc3 = nn.Linear(1024, config.num_classes)

    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.conv5(x)
        x = self.conv6(x)
        x = x.view(x.size(0), -1)  # flatten to 2-D before the fully connected layers
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.fc3(x)
        return x
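
A quick smoke test of the shapes, using a stand-in config with the values the post implies (70 characters, six 256-filter layers, kernel sizes 7,7,3,3,3,3, four AG classes). The sequence length shrinks as 1014 → 1008 → 336 (conv7 + pool3), 336 → 330 → 110 (conv7 + pool3), 110 → 108 → 106 → 104 (three conv3), 104 → 102 → 34 (conv3 + pool3), so 34 × 256 = 8704 features enter fc1:

import types
import torch

config = types.SimpleNamespace(
    char_num=70,
    features=[256, 256, 256, 256, 256, 256],
    kernel_sizes=[7, 7, 3, 3, 3, 3],
    dropout=0.5,
    num_classes=4,
)
model = CharTextCNN(config)
x = torch.randn(2, 70, 1014)  # a batch of two one-hot encoded sentences
print(model(x).shape)         # torch.Size([2, 4])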

5. Training and testing
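
The script below imports a config module that the post does not include. Here is a minimal sketch of what config.py might look like; the parameter names are dictated by the code, but the defaults are assumptions (the paper's small model uses 256 filters, kernel sizes 7,7,3,3,3,3, and dropout 0.5, and AG News has 4 classes):

# config.py: a hypothetical reconstruction; the original file is not shown
import argparse

def ArgumentParser():
    parser = argparse.ArgumentParser()
    parser.add_argument("--char_num", type=int, default=70)    # alphabet size
    parser.add_argument("--l0", type=int, default=1014)        # characters per sentence
    parser.add_argument("--features", type=str, default="256,256,256,256,256,256")
    parser.add_argument("--kernel_sizes", type=str, default="7,7,3,3,3,3")
    parser.add_argument("--pooling", type=str, default="3,3,3")
    parser.add_argument("--dropout", type=float, default=0.5)
    parser.add_argument("--num_classes", type=int, default=4)  # AG News has 4 topics
    parser.add_argument("--batch_size", type=int, default=128)
    parser.add_argument("--epoch", type=int, default=10)
    parser.add_argument("--learning_rate", type=float, default=1e-3)
    parser.add_argument("--cuda", action="store_true")         # use the GPU if available
    parser.add_argument("--gpu", type=int, default=0)          # GPU device index
    return parser.parse_args()

With such a file in place, the script can be run as, e.g., python main.py --cuda --batch_size 128 (the script name is assumed).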

import torch
import torch.nn as nn
import torch.optim as optim
from model import CharTextCNN
from data import AG_Data
from tqdm import tqdm
import numpy as np
import config as argumentparser

config = argumentparser.ArgumentParser()  # read the parameter settings
config.features = list(map(int, config.features.split(",")))          # split "256,256,..." on commas and cast to int
config.kernel_sizes = list(map(int, config.kernel_sizes.split(",")))  # same for the kernel sizes
config.pooling = list(map(int, config.pooling.split(",")))

if config.cuda and torch.cuda.is_available():  # whether to use the GPU
    torch.cuda.set_device(config.gpu)

# Load the training set
training_set = AG_Data(data_path="/AG/train.csv", l0=config.l0)
training_iter = torch.utils.data.DataLoader(dataset=training_set,
                                            batch_size=config.batch_size,
                                            shuffle=True,
                                            num_workers=0)
# Load the test set
test_set = AG_Data(data_path="/AG/test.csv", l0=config.l0)
test_iter = torch.utils.data.DataLoader(dataset=test_set,
                                        batch_size=config.batch_size,
                                        shuffle=False,
                                        num_workers=0)

model = CharTextCNN(config)  # initialize the model
if config.cuda and torch.cuda.is_available():  # if using the GPU, move the model there
    model.cuda()
criterion = nn.CrossEntropyLoss()  # build the loss
optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)  # build the optimizer
loss = -1  # sentinel for the exponentially smoothed training loss

def get_test_result(data_iter, data_set):
    # Evaluate on the test set: average loss and accuracy
    model.eval()
    data_loss = 0
    true_sample_num = 0
    with torch.no_grad():
        for data, label in data_iter:
            data = data.float()  # ensure float32 regardless of device
            if config.cuda and torch.cuda.is_available():
                data = data.cuda()
                label = label.cuda()
            out = model(data)
            data_loss += criterion(out, label.long()).item() * data.size(0)
            true_sample_num += np.sum((torch.argmax(out, 1) == label).cpu().numpy())  # correct predictions in this batch
    acc = true_sample_num / len(data_set)
    return data_loss / len(data_set), acc


for epoch in range(config.epoch):
    model.train()
    process_bar = tqdm(training_iter)
    for data, label in process_bar:
        data = data.float()
        if config.cuda and torch.cuda.is_available():  # if using the GPU, move the batch there
            data = data.cuda()
            label = label.cuda()
        label = label.squeeze()
        out = model(data)
        loss_now = criterion(out, label.long())
        if loss == -1:
            loss = loss_now.data.item()
        else:
            loss = 0.95 * loss + 0.05 * loss_now.data.item()  # exponential smoothing
        process_bar.set_postfix(loss=loss_now.data.item())  # show the current loss on the progress bar
        process_bar.update()
        optimizer.zero_grad()  # clear the gradients
        loss_now.backward()    # backpropagate
        optimizer.step()       # update the parameters
    test_loss, test_acc = get_test_result(test_iter, test_set)
    print("The test acc is: %.5f" % test_acc)

6. Experimental results

Test-set accuracy is printed after each epoch (the output screenshot is omitted).
