Обучение в PyCharm


import numpy as np
import gym
from gym import spaces


class GoLeftEnv(gym.Env):
    """
    Custom environment following the gym interface: a ghost fights three pirates.

    On every step the agent chooses which pirate to attack (one of
    ``ATTACK``, ``ATTACK2``, ``ATTACK3``). The chosen pirate loses
    ``ATTACK_DAMAGE`` health, then every pirate whose health is still above
    zero removes ``PIRATE_DAMAGE`` health from the ghost.

    Observation: float32 array ``[ghost, first, second, third]`` health values.
    Reward: ``WIN_REWARD`` on a step where all pirates are at or below zero
    health while the ghost is still above zero, otherwise 0.
    Episode end: all pirates at or below zero health, or ghost at or below zero.

    The class name and the ``grid_size`` / ``agent_pos`` attributes come from a
    1D-grid example; they are not used by the dynamics and are kept only so
    existing callers keep working.
    """
    # Only text output is supported (no GUI / 'human' render mode).
    metadata = {'render.modes': ['console']}

    # Action ids: which pirate gets attacked.
    ATTACK = 0
    ATTACK2 = 1
    ATTACK3 = 2

    # Game parameters.
    GHOST_START_HEALTH = 64
    PIRATE_START_HEALTH = 10
    ATTACK_DAMAGE = 2   # damage the ghost deals to the targeted pirate
    PIRATE_DAMAGE = 2   # damage each living pirate deals to the ghost
    WIN_REWARD = 100

    def __init__(self, grid_size=100):
        """
        :param grid_size: (int) kept for backward compatibility; stored but
            not used by the fight dynamics.
        """
        super(GoLeftEnv, self).__init__()

        # Legacy attributes, unused by step()/reset().
        self.grid_size = grid_size
        self.agent_pos = grid_size - 1

        # Start from the same state reset() produces, so a freshly built env
        # and a reset env agree (previously the ghost started at 70 here but
        # at 64 after reset()).
        self._reset_state()

        # One discrete action per pirate.
        n_actions = 3
        self.action_space = spaces.Discrete(n_actions)
        # Four health values; bounds are generous because health values are
        # not clamped and may go below zero.
        self.observation_space = spaces.Box(low=-100, high=100,
                                            shape=(4,), dtype=np.float32)

    def _reset_state(self):
        """Put ghost and pirates back to their starting health."""
        self.ghost = self.GHOST_START_HEALTH
        self.firstpiratehealth = self.PIRATE_START_HEALTH
        self.secondpiratehealth = self.PIRATE_START_HEALTH
        self.thirdpiratehealth = self.PIRATE_START_HEALTH

    def _pirate_healths(self):
        """Return the three pirate health values as a tuple."""
        return (self.firstpiratehealth,
                self.secondpiratehealth,
                self.thirdpiratehealth)

    def _get_obs(self):
        """Build the observation: [ghost, first, second, third] as float32."""
        return np.array([self.ghost, *self._pirate_healths()]).astype(np.float32)

    def reset(self):
        """
        Start a new episode.

        Important: the observation must be a numpy array.
        :return: (np.array) float32 observation of the initial state
        """
        self._reset_state()
        return self._get_obs()

    def step(self, action):
        """
        Apply one attack, then let the surviving pirates hit the ghost.

        :param action: one of ATTACK, ATTACK2, ATTACK3
        :return: (observation, reward, done, info)
        :raises ValueError: if ``action`` is not a known action id
        """
        if action == self.ATTACK:
            self.firstpiratehealth -= self.ATTACK_DAMAGE
        elif action == self.ATTACK2:
            self.secondpiratehealth -= self.ATTACK_DAMAGE
        elif action == self.ATTACK3:
            self.thirdpiratehealth -= self.ATTACK_DAMAGE
        else:
            raise ValueError("Received invalid action={} which is not part of the action space".format(action))

        # Every pirate still above zero health strikes back once.
        pirates_alive = sum(1 for health in self._pirate_healths() if health > 0)
        self.ghost -= self.PIRATE_DAMAGE * pirates_alive

        all_pirates_dead = pirates_alive == 0
        done = bool(all_pirates_dead or self.ghost <= 0)

        # Sparse reward: only a win (pirates dead, ghost alive) pays out.
        if all_pirates_dead and self.ghost > 0:
            reward = self.WIN_REWARD
        else:
            reward = 0

        # Optionally we can pass additional info, we are not using that for now
        info = {}

        return self._get_obs(), reward, done, info

    def render(self, mode='console'):
        """
        Print the current health of the ghost and of each pirate.

        :param mode: only 'console' is supported
        :raises NotImplementedError: for any other mode (raised before
            anything is printed)
        """
        if mode != 'console':
            raise NotImplementedError()
        print("Здоровье призрака - ")
        print(self.ghost)
        print("Здоровье первого пирата - ")
        print(self.firstpiratehealth)
        print("Здоровье второго пирата - ")
        print(self.secondpiratehealth)
        print("Здоровье третьего пирата - ")
        print(self.thirdpiratehealth)

    def close(self):
        """Nothing to release."""
        pass

from stable_baselines3 import DQN, A2C, HER, TD3
from stable_baselines3.common.env_util import make_vec_env

# Build the vectorized env directly from the class. Passing the class (plus
# env_kwargs) avoids a lambda that closes over the global name `env`, which
# this very statement rebinds to the VecEnv.
env = make_vec_env(GoLeftEnv, n_envs=1, env_kwargs={"grid_size": 100})

# Train the agent (start training; at least 50k steps are recommended)
model = DQN("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=200000)

# Roll out the trained policy for at most n_steps and print what happens.
obs = env.reset()
n_steps = 40
for step in range(n_steps):

    action, _ = model.predict(obs, deterministic=True)
    print("Step {}".format(step + 1))
    print("Action: ", action)
    obs, reward, done, info = env.step(action)
    print('obs=', obs, 'reward=', reward, 'done=', done)
    # The VecEnv returns arrays with one entry per sub-env; there is exactly one.
    if done[0]:
        # Note that the VecEnv resets automatically when a done signal is
        # encountered, so `obs` and env.render() already show the NEXT
        # episode. The final state of the finished episode is in `info`.
        print("terminal obs=", info[0].get("terminal_observation"))
        if reward[0] > 0:
            print("Goal reached!", "reward=", reward)
        else:
            # done with zero reward means the ghost ran out of health.
            print("Episode ended without reaching the goal.", "reward=", reward)
        break
    env.render(mode='console')