当前位置：网站首页>Data loading and preprocessing

Data loading and preprocessing

2022-07-01 04:46:00 【CyrusMay】

PyTorch (Part 4) — Data preprocessing

1. Reading data with torch.utils.data.Dataset
2. Loading data with torch.utils.data.DataLoader
3. Fast data reading with torchvision.datasets.ImageFolder

1. Reading data with torch.utils.data.Dataset

Read the data by subclassing this class.
The files are laid out as follows:

import torch
from torch.utils.data import Dataset,DataLoader
import os
import csv
import glob
import random
from PIL import Image
from torchvision import transforms
import visdom
from torchvision.datasets import ImageFolder

class AnimalData(Dataset):
    """Image-classification dataset with one sub-folder of ``root`` per class.

    Class labels are assigned by enumerating the sub-folder names of ``root``
    in sorted order.  The (path, label) pairs are cached in ``animal.csv``
    (relative to the current working directory); the file is created on first
    use with the samples in random order and simply re-read afterwards.  The
    cached order is split 60% / 20% / 20% into train / val / test.

    Args:
        root: Directory that contains one sub-directory per class.
        resize: Output image size as ``[height, width]``; defaults to
            ``[28, 28]``.
        mode: Which split to expose: ``"train"``, ``"val"`` or ``"test"``.

    Raises:
        ValueError: If ``mode`` is not one of the three split names.
    """

    # Per-channel statistics applied by ``Normalize`` in ``__getitem__`` and
    # undone by ``de_normalize``; kept in one place so the two cannot drift.
    _MEAN = (0.485, 0.456, 0.406)
    _STD = (0.229, 0.224, 0.225)

    def __init__(self, root, resize=None, mode="train"):
        super().__init__()
        if mode not in ("train", "val", "test"):
            # Fail early instead of leaving self.images / self.labels unset.
            raise ValueError(
                f"mode must be 'train', 'val' or 'test', got {mode!r}"
            )
        self.root = root
        # A None sentinel avoids sharing one mutable default list between
        # instances.
        self.resize = [28, 28] if resize is None else resize  # [h, w]

        # Derive the label of each class from the sub-folder names.
        self.class2label = {}
        for name in sorted(os.listdir(self.root)):
            if not os.path.isdir(os.path.join(self.root, name)):
                continue
            self.class2label[name] = len(self.class2label)
        print(self.class2label)

        # Load the stored paths and labels from the CSV index.
        images, labels = self.load_csv("animal.csv")

        # Expose only the slice that belongs to the requested split.
        total = len(images)
        train_end = int(0.6 * total)
        val_end = int(0.8 * total)
        bounds = {
            "train": (0, train_end),
            "val": (train_end, val_end),
            "test": (val_end, total),
        }
        start, stop = bounds[mode]
        self.images = images[start:stop]
        self.labels = labels[start:stop]

    def load_csv(self, file_name):
        """Return ``(images, labels)`` from ``file_name``, creating it if absent.

        When the file does not exist, every ``*.png`` and ``*.jpg`` file below
        each class folder is collected, shuffled and written as
        ``path,label`` rows.  The file is then read back, so the returned
        order is always the order stored on disk.
        """
        if not os.path.exists(file_name):
            rows = []
            for name, label in self.class2label.items():
                # Pair each path with its label while the class is known,
                # rather than re-parsing the class name out of the path
                # (which breaks for absolute or nested roots).
                for pattern in ("*.png", "*.jpg"):
                    matches = glob.glob(os.path.join(self.root, name, pattern))
                    rows.extend((path, label) for path in matches)

            # Shuffle once so the cached order, and hence the splits, mix
            # all classes.
            random.shuffle(rows)

            # Write the index so later runs can reuse it.
            with open(file_name, "w", encoding="utf-8", newline="") as f:
                csv.writer(f).writerows(rows)

        # Load the index from the CSV file.
        images = []
        labels = []
        with open(file_name, "r", encoding="utf-8", newline="") as f:
            for row in csv.reader(f):
                if not row:
                    # Tolerate blank lines in a hand-edited index.
                    continue
                images.append(row[0])
                labels.append(int(row[1]))
        return images, labels

    def __len__(self):
        """Return the number of samples in the selected split."""
        return len(self.images)

    def de_normalize(self, x_hat):
        """Undo the ``Normalize`` step so an image can be visualised.

        ``x_hat`` is multiplied by the per-channel std and shifted by the
        per-channel mean; the constants have shape ``(3, 1, 1)`` and
        broadcast over the trailing dimensions.
        """
        # Create the constants on x_hat's device so GPU tensors work too.
        mean = torch.tensor(self._MEAN, device=x_hat.device).view(-1, 1, 1)
        std = torch.tensor(self._STD, device=x_hat.device).view(-1, 1, 1)
        return x_hat * std + mean

    def __getitem__(self, idx):
        """Return the ``(image, label)`` tensors of sample ``idx``.

        NOTE: the random rotation is applied for every ``mode``, including
        ``"val"`` and ``"test"``.
        """
        label = torch.tensor(self.labels[idx])

        # convert() returns an independent image, so the file handle can be
        # released right away.
        with Image.open(self.images[idx]) as img:
            image = img.convert("RGB")

        tf = transforms.Compose([
            # Enlarge first so rotating and cropping leave no empty corners.
            transforms.Resize([int(self.resize[0] * 1.25),
                               int(self.resize[1] * 1.25)]),
            transforms.RandomRotation(15),  # data augmentation
            transforms.CenterCrop(self.resize),
            transforms.ToTensor(),
            transforms.Normalize(mean=list(self._MEAN), std=list(self._STD)),
        ])
        return tf(image), label

if __name__ == '__main__':
    # Desired output size as [height, width].
    resize = [128, 100]
    # Constructing the dataset prints the class-name -> label mapping.
    db = AnimalData(resize=resize, root="animal")

{'cat': 0, 'dog': 1, 'rabbit': 2}

2. Loading data with torch.utils.data.DataLoader

if __name__ == '__main__':
    # Desired output size as [height, width].
    resize = [128, 100]
    db = AnimalData(resize=resize, root="animal")

    # Fetch a single sample through the iterator protocol and display it
    # after undoing the normalisation.
    vis = visdom.Visdom()
    it_db = iter(db)
    image, label = next(it_db)
    vis.image(
        db.de_normalize(image),
        win="iter_image",
        opts={"title": "iter_image"},
    )

    # Batch the dataset with a data loader.  num_workers sets how many
    # worker subprocesses (not threads) load the data.
    loader = DataLoader(db, batch_size=16, shuffle=True, num_workers=8)
    for x, y in loader:
        # Every batch is drawn into the same window, four images per row.
        vis.images(
            db.de_normalize(x),
            win="batch_imags",
            nrow=4,
            opts={"title": "batch"},
        )

Insert picture description here

3. Fast data reading with torchvision.datasets.ImageFolder

  # ImageFolder performs the whole process above in a single step
    # Same preprocessing pipeline as before, minus the explicit image
    # loading step: the images are read by ImageFolder, not by this pipeline.
    tf = transforms.Compose([
        # Enlarge by 25% so rotating and cropping leave no empty corners.
        transforms.Resize([int(resize[0] * 1.25), int(resize[1] * 1.25)]),
        # Data augmentation: random rotation (argument 15).
        transforms.RandomRotation(15),
        # Crop the centre back to the target size.
        transforms.CenterCrop(resize),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ])
    db = ImageFolder(root="animal", transform=tf)

by CyrusMay 2022 06 30

How many twists and turns do you have in your life
To go to the other side of happiness
Can live this life without regret
Ordinary but not plain
—————— May day （ Qingkong future ）——————

原网站

版权声明
本文为[CyrusMay]所创，转载请带上原文链接，感谢
https://yzsam.com/2022/182/202207010435183158.html