PyTorch notes: TD3
2022-07-27 07:15:00 【UQI-LIUWJ】
Reference code: easy-rl/codes/TD3 at master · datawhalechina/easy-rl (github.com)
Theory: Reinforcement learning notes: Twin Delayed Deep Deterministic Policy Gradient (TD3) - UQI-LIUWJ's CSDN blog
1 task1_train.py
1.1 Imports
import sys,os
curr_path = os.path.dirname(__file__)
parent_path=os.path.dirname(curr_path)
sys.path.append(parent_path)  # add the parent directory to sys.path so that sibling modules (agent, utils) can be imported
import torch
import gym
import numpy as np
import datetime
from agent import TD3
from utils import save_results,make_dir,plot_rewards
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
# get the current time, used to name the output directories
1.2 TD3Config
Basic configuration for TD3:
class TD3Config:
    def __init__(self) -> None:
        self.algo = 'TD3'
        # algorithm name
        self.env_name = 'Pendulum-v1'
        # environment name
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # use the GPU if one is available
        self.train_eps = 600
        # number of training episodes
        self.epsilon_start = 50
        # before this many episodes, actions are sampled randomly;
        # after that, actions are chosen by the actor
        self.eval_freq = 10
        # print an intermediate result every eval_freq episodes
        self.max_timestep = 100000
        # maximum number of iterations per episode
        self.expl_noise = 0.1
        # standard deviation of the Gaussian exploration noise (scaled by max_action when used)
        self.batch_size = 256
        # batch size shared by the actor and the critic
        self.gamma = 0.9
        # discount factor
        self.lr = 3e-4
        # learning rate
        self.soft = 0.0005
        # soft-update coefficient (tau)
        self.policy_noise = 0.2
        # when computing the critic target, clipped Gaussian noise is added to the target action;
        # policy_noise is the factor that scales the N(0,1) noise (target policy smoothing)
        self.noise_clip = 0.3
        # clipping bound for that smoothing noise
        self.policy_freq = 2
        # the actor and the target networks are not updated at every step, but once every policy_freq updates
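A small illustration (not part of the original script) of how policy_noise and noise_clip interact: the smoothing noise is drawn from N(0, policy_noise²) and then clipped to [-noise_clip, noise_clip].
import torch
torch.manual_seed(0)
noise = (torch.randn(5) * 0.2).clamp(-0.3, 0.3)
print(noise)   # every entry lies in [-0.3, 0.3]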
1.3 PlotConfig
Configuration of the plotting and output paths:
class PlotConfig(TD3Config):
def __init__(self) -> None:
super().__init__()
self.result_path = "./outputs/" + self.env_name + '/'+curr_time+'/results/'
print(self.result_path)
# The path to save the results
self.model_path = "./outputs/" + self.env_name + '/'+curr_time+'/models/'
# The path to save the model
self.save = True
        # whether to save the figures
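For example, with a timestamp of 20220727-071500 (the format produced by curr_time above), the two paths would be ./outputs/Pendulum-v1/20220727-071500/results/ and ./outputs/Pendulum-v1/20220727-071500/models/.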
1.4 train
def train(cfg, env, agent):
    print('Start training!')
    print(f'Environment: {cfg.env_name}, Algorithm: {cfg.algo}, Device: {cfg.device}')
    rewards = []
    # rewards of all episodes
    ma_rewards = []
    # moving-average rewards of all episodes
for i_ep in range(int(cfg.train_eps)):
ep_reward = 0
ep_timesteps = 0
state, done = env.reset(), False
while not done:
ep_timesteps += 1
if i_ep < cfg.epsilon_start:
action = env.action_space.sample()
# Randomly choose actions
else:
action = (
agent.choose_action(np.array(state))
+ np.random.normal(0, max_action * cfg.expl_noise, size=n_actions)
).clip(-max_action, max_action)
                # choose the action with the actor, add Gaussian exploration noise,
                # then clip the result into the valid action range
            '''
            At the beginning the critic is not accurate yet, so relying on the actor (which is trained
            against this critic) to choose actions would not work well. Random actions are therefore
            used at first to collect data and train the critic.
            Once the critic has been trained for a while, the actor can be used to choose actions.
            '''
            next_state, reward, done, _ = env.step(action)
            # interact with the environment using this action
            done_bool = float(done) if ep_timesteps < env._max_episode_steps else 0
            # done_bool marks a real termination: if the episode only ended because the step limit
            # (env._max_episode_steps) was reached, it is stored as 0 so that the next state is still bootstrapped
            agent.memory.push(state, action, next_state, reward, done_bool)
            # store the transition in the replay buffer
state = next_state
ep_reward += reward
# Train agent after collecting sufficient data
if i_ep+1 >= cfg.epsilon_start:
agent.update()
                # before that, the actions are chosen randomly and do not depend on the actor,
                # so the networks are not updated yet
        if (i_ep+1) % cfg.eval_freq == 0:
            print('Episode: {}/{}, Reward: {:.2f}'.format(i_ep+1, cfg.train_eps, ep_reward))
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
# Moving average reward
    print('Finish training!')
return rewards, ma_rewards
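The moving-average reward recorded above is an exponential moving average, ma_t = 0.9·ma_{t-1} + 0.1·r_t, seeded with the first episode reward. A tiny sketch with made-up reward values:
rewards = [-1500.0, -1200.0, -900.0]   # illustrative episode rewards
ma = []
for r in rewards:
    ma.append(0.9 * ma[-1] + 0.1 * r if ma else r)
# ma -> [-1500.0, -1470.0, -1413.0] (up to floating-point rounding)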
1.5 eval
def eval(env_name,agent, eval_episodes=10):
eval_env = gym.make(env_name)
rewards,ma_rewards =[],[]
for i_episode in range(eval_episodes):
ep_reward = 0
state, done = eval_env.reset(), False
while not done:
# eval_env.render()
action = agent.choose_action(np.array(state))
            # choose the action with the trained actor (no exploration noise is added during evaluation)
state, reward, done, _ = eval_env.step(action)
ep_reward += reward
print(f"Episode:{i_episode+1}, Reward:{ep_reward:.3f}")
rewards.append(ep_reward)
        # moving-average reward over the evaluation episodes
if ma_rewards:
ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
    return rewards, ma_rewards

1.6 The main function
if __name__ == "__main__":
    cfg = TD3Config()
    # TD3 hyper-parameters
    plot_cfg = PlotConfig()
    # plotting / output-path configuration
    env = gym.make(cfg.env_name)
    # create the environment
    env.seed(1)
    torch.manual_seed(1)
    np.random.seed(1)
    # set the random seeds
    n_states = env.observation_space.shape[0]
    n_actions = env.action_space.shape[0]
    # read the state and action dimensions from the environment
    max_action = float(env.action_space.high[0])
    # largest value in the action space (2.0 for Pendulum-v1)
    agent = TD3(n_states, n_actions, max_action, cfg)
    # initialise the agent
    # for Pendulum-v1 the input dimension is 3 and the output dimension is 1
rewards,ma_rewards = train(cfg,env,agent)
# Training TD3
    make_dir(plot_cfg.result_path, plot_cfg.model_path)
    # create the output folders
    agent.save(path=plot_cfg.model_path)
    # save the actor and critic to model_path
    save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path)
    # save the rewards and the moving-average rewards
    plot_rewards(rewards, ma_rewards, plot_cfg, tag="train")
    # plot the reward curves
############################### Test part ##########################################
eval_agent=TD3(n_states,n_actions,max_action,cfg)
eval_agent.load(path=plot_cfg.model_path)
rewards,ma_rewards = eval(cfg.env_name,eval_agent)
    make_dir(plot_cfg.result_path)
    # reuse result_path for the evaluation results (PlotConfig does not define result_path_eval);
    # the 'eval' tag keeps these files separate from the training ones
    save_results(rewards, ma_rewards, tag='eval', path=plot_cfg.result_path)
plot_rewards(rewards,ma_rewards,plot_cfg,tag="eval")
################################################################################
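As a quick sanity check (assuming the standard gym classic-control Pendulum-v1), the dimensions used above can be verified directly:
import gym

env = gym.make('Pendulum-v1')
print(env.observation_space.shape)  # (3,)  -> n_states = 3
print(env.action_space.shape)       # (1,)  -> n_actions = 1
print(env.action_space.high)        # [2.]  -> max_action = 2.0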
2 agent.py

2.1 Imports
import copy
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from TD3.memory import ReplayBuffer

2.2 Actor
class Actor(nn.Module):
def __init__(self, input_dim, output_dim, max_action):
        '''
        Args:
            input_dim (int): input dimension, equal to n_states
            output_dim (int): output dimension, equal to n_actions
            max_action (float): maximum magnitude of an action
        '''
super(Actor, self).__init__()
self.l1 = nn.Linear(input_dim, 256)
self.l2 = nn.Linear(256, 256)
self.l3 = nn.Linear(256, output_dim)
self.max_action = max_action
def forward(self, state):
a = F.relu(self.l1(state))
a = F.relu(self.l2(a))
return self.max_action * torch.tanh(self.l3(a))
        # tanh squashes the output into [-1, 1]; multiplying by max_action rescales it to the action range the environment expects
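A minimal sketch (illustrative, reusing the Actor class above with Pendulum-v1's dimensions) of a forward pass:
import torch

actor = Actor(input_dim=3, output_dim=1, max_action=2.0)
state = torch.randn(5, 3)     # a batch of 5 states
a = actor(state)              # shape (5, 1); every entry lies in [-2.0, 2.0]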
2.3 Critic
class Critic(nn.Module):
def __init__(self, input_dim, output_dim):
super(Critic, self).__init__()
# Q1 architecture
self.l1 = nn.Linear(input_dim + output_dim, 256)
self.l2 = nn.Linear(256, 256)
self.l3 = nn.Linear(256, 1)
# Q2 architecture
self.l4 = nn.Linear(input_dim + output_dim, 256)
self.l5 = nn.Linear(256, 256)
self.l6 = nn.Linear(256, 1)
        '''
        This is equivalent to implementing value network 1 and value network 2 from the TD3 diagram
        (in the theory post) within a single class.
        '''
def forward(self, state, action):
sa = torch.cat([state, action], 1)
q1 = F.relu(self.l1(sa))
q1 = F.relu(self.l2(q1))
q1 = self.l3(q1)
q2 = F.relu(self.l4(sa))
q2 = F.relu(self.l5(q2))
q2 = self.l6(q2)
return q1, q2
def Q1(self, state, action):
sa = torch.cat([state, action], 1)
q1 = F.relu(self.l1(sa))
q1 = F.relu(self.l2(q1))
q1 = self.l3(q1)
        return q1
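A minimal sketch (illustrative) of how the twin critic above is called: forward() returns both Q estimates, while Q1() evaluates only the first head, which is what the actor loss uses later.
import torch

critic = Critic(input_dim=3, output_dim=1)
state, action = torch.randn(5, 3), torch.randn(5, 1)
q1, q2 = critic(state, action)       # two (5, 1) value estimates
q1_only = critic.Q1(state, action)   # only the first head, used for the actor loss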
2.4 TD3
class TD3(object):
def __init__(
self,
input_dim,
output_dim,
max_action,
cfg,
):
        self.max_action = max_action
        # largest value in the action space (2.0 for Pendulum-v1)
        self.gamma = cfg.gamma
        # discount factor
        self.lr = cfg.lr
        # learning rate
        self.soft = cfg.soft
        # soft-update coefficient (tau)
        self.policy_noise = cfg.policy_noise
        # when computing the critic target, clipped Gaussian noise is added to the target action;
        # policy_noise is the factor that scales the N(0,1) noise (target policy smoothing)
        self.noise_clip = cfg.noise_clip
        # clipping bound for that smoothing noise
        self.policy_freq = cfg.policy_freq
        # the actor is not updated at every step but once every policy_freq steps
        # (the critic is updated at every step)
        self.batch_size = cfg.batch_size
        # batch size shared by the actor and the critic
        self.device = cfg.device
        # device used for training
        self.total_it = 0
        # iteration counter, used to decide when the actor and the target networks should be updated
self.actor = Actor(input_dim, output_dim, max_action).to(self.device)
self.actor_target = copy.deepcopy(self.actor)
# Target policy network
self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.lr)
self.critic = Critic(input_dim, output_dim).to(self.device)
self.critic_target = copy.deepcopy(self.critic)
        # target value network (both of TD3's target Q networks live inside this one module)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.lr)
        self.memory = ReplayBuffer(input_dim, output_dim)
        # replay buffer
    def choose_action(self, state):
        state = torch.FloatTensor(state.reshape(1, -1)).to(self.device)
        # state: shape [3] -> [1, 3]
        return self.actor(state).cpu().data.numpy().flatten()
        # actor output: shape [1, 1] -> [1]
    def update(self):
        self.total_it += 1
        # sample from the replay buffer
        state, action, next_state, reward, not_done = self.memory.sample(self.batch_size)
        with torch.no_grad():
            # select the next action according to the target policy and add clipped noise
            noise = (
                torch.randn_like(action) * self.policy_noise
            ).clamp(-self.noise_clip, self.noise_clip)
            # clipped Gaussian noise
            next_action = (
                self.actor_target(next_state) + noise
            ).clamp(-self.max_action, self.max_action)
            # next_action: output of the target policy network plus the clipped noise,
            # clamped back into the valid action range
            # compute the target Q value
            target_Q1, target_Q2 = self.critic_target(next_state, next_action)
            # outputs of the two target value networks
            target_Q = torch.min(target_Q1, target_Q2)
            # take the smaller of the two target Q values as the bootstrap value (clipped double-Q)
            target_Q = reward + not_done * self.gamma * target_Q
            # TD target; the bootstrap term is dropped when the state is terminal
        current_Q1, current_Q2 = self.critic(state, action)
        # outputs of the two value networks
        critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()
        # update the parameters of both value networks together
        if self.total_it % self.policy_freq == 0:
            # the actor and the target networks are updated only once every policy_freq steps
            # (the critic is updated at every step)
            actor_loss = -self.critic.Q1(state, self.actor(state)).mean()
            # actor loss: following TD3, only value network 1 (Q1) is used here
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()
            # update the actor
            for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
                target_param.data.copy_(self.soft * param.data + (1 - self.soft) * target_param.data)
            for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
                target_param.data.copy_(self.soft * param.data + (1 - self.soft) * target_param.data)
            # soft update of the target networks
def save(self, path):
torch.save(self.critic.state_dict(), path + "td3_critic")
torch.save(self.critic_optimizer.state_dict(), path + "td3_critic_optimizer")
torch.save(self.actor.state_dict(), path + "td3_actor")
torch.save(self.actor_optimizer.state_dict(), path + "td3_actor_optimizer")
def load(self, path):
self.critic.load_state_dict(torch.load(path + "td3_critic"))
self.critic_optimizer.load_state_dict(torch.load(path + "td3_critic_optimizer"))
self.critic_target = copy.deepcopy(self.critic)
self.actor.load_state_dict(torch.load(path + "td3_actor"))
self.actor_optimizer.load_state_dict(torch.load(path + "td3_actor_optimizer"))
self.actor_target = copy.deepcopy(self.actor)
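For reference, the target computed in update() above can be written compactly (here σ = policy_noise, c = noise_clip, a_max = max_action, τ = cfg.soft, and d is the termination flag, i.e. 1 − not_done):

\tilde a = \operatorname{clip}\big(\pi_{\theta'}(s') + \operatorname{clip}(\epsilon,\,-c,\,c),\ -a_{\max},\ a_{\max}\big), \quad \epsilon \sim \mathcal{N}(0, \sigma^2)
y = r + \gamma\,(1-d)\,\min\big(Q_{\phi'_1}(s',\tilde a),\ Q_{\phi'_2}(s',\tilde a)\big)

and the soft updates performed on the delayed step are \theta' \leftarrow \tau\,\theta + (1-\tau)\,\theta' for both the actor and the critic target networks.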
3 memory.py
import numpy as np
import torch
class ReplayBuffer(object):
def __init__(self, n_states, n_actions, max_size=int(1e6)):
        self.max_size = max_size
        # maximum capacity of the replay buffer
        self.ptr = 0
        # current write pointer (the index where the next pushed transition will be stored)
        self.size = 0
        # current number of stored transitions
self.state = np.zeros((max_size, n_states))
self.action = np.zeros((max_size, n_actions))
self.next_state = np.zeros((max_size, n_states))
self.reward = np.zeros((max_size, 1))
self.not_done = np.zeros((max_size, 1))
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    def push(self, state, action, next_state, reward, done):
        # store a transition in the replay buffer
self.state[self.ptr] = state
self.action[self.ptr] = action
self.next_state[self.ptr] = next_state
self.reward[self.ptr] = reward
self.not_done[self.ptr] = 1. - done
self.ptr = (self.ptr + 1) % self.max_size
self.size = min(self.size + 1, self.max_size)
    def sample(self, batch_size):
        # randomly sample batch_size transitions
ind = np.random.randint(0, self.size, size=batch_size)
return (
torch.FloatTensor(self.state[ind]).to(self.device),
torch.FloatTensor(self.action[ind]).to(self.device),
torch.FloatTensor(self.next_state[ind]).to(self.device),
torch.FloatTensor(self.reward[ind]).to(self.device),
torch.FloatTensor(self.not_done[ind]).to(self.device)
)
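A minimal usage sketch (illustrative) of the ring buffer above: once max_size transitions have been pushed, ptr wraps around and the oldest entries are overwritten.
import numpy as np

buffer = ReplayBuffer(n_states=3, n_actions=1, max_size=4)
for t in range(6):
    s = np.zeros(3)
    buffer.push(s, np.zeros(1), s, reward=0.0, done=False)
print(buffer.size, buffer.ptr)   # 4 2 -> the buffer is full and the pointer has wrapped around
states, actions, next_states, rewards, not_dones = buffer.sample(batch_size=2)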
4 utils.py
import os
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
4.1 plot_rewards
def plot_rewards(rewards, ma_rewards, plot_cfg, tag='train'):
sns.set()
plt.figure()
plt.title("learning curve on {} of {}".format(plot_cfg.device, 'TD3'))
    plt.xlabel('episodes')
    plt.plot(rewards, label='rewards')
    plt.plot(ma_rewards, label='ma rewards')
    # plot the raw rewards and the moving-average rewards
plt.legend()
if plot_cfg.save:
plt.savefig(plot_cfg.result_path+"{}_rewards_curve".format(tag))
plt.show()
4.2 save_results
def save_results(rewards, ma_rewards, tag='train', path='./results'):
''' Save rewards
'''
np.save(path+'{}_rewards.npy'.format(tag), rewards)
np.save(path+'{}_ma_rewards.npy'.format(tag), ma_rewards)
    print('The results are saved!')

4.3 make_dir
def make_dir(*paths):
''' Create folder
'''
for path in paths:
        Path(path).mkdir(parents=True, exist_ok=True)

5 Output results
