# sparse-Q-learning/main.py at master · amaurylekens/sparse-Q-learning · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
import argparse
import json
import sys
from multiprocessing import Process, Manager, Lock
from statistics import mean
from typing import List, Tuple

import matplotlib.pyplot as plt
import numpy as np

from agent import Agent
from game import Game
from generate_game_rules import generate_game_rules
from rules import Rules


class Main(object):
    """Command-line entry point that dispatches to one of the run modes.

    The first command-line token selects the mode; the remaining tokens are
    parsed by the method of the same name (``learn``, ``play`` or ``test``).
    Instantiating the class performs the whole dispatch.
    """

    # Only these names may be selected from the command line. Checking against
    # an explicit list (instead of ``hasattr``) prevents tokens such as
    # ``__init__`` or ``__doc__`` from being treated as runnable modes.
    COMMANDS = ('learn', 'play', 'test')

    def __init__(self):
        """Read the mode from ``sys.argv[1]`` and invoke the matching method.

        Prints the help text and exits with status 1 when the mode is not one
        of ``COMMANDS``.
        """
        parser = argparse.ArgumentParser(
            description='Sparse q-learning',
            usage='''main <command> [<args>]

The commands are:
   learn     Let the agents learn a policy during n episodes
   play      Play the game with a learned policy
   test      Test the performance of the learning
''')
        parser.add_argument('mode', help='mode to run')

        # parse only the first token here; each mode parses the rest itself
        args = parser.parse_args(sys.argv[1:2])
        if args.mode not in self.COMMANDS:
            print('Unrecognized mode')
            parser.print_help()
            # sys.exit rather than the interactive-only ``exit`` helper
            sys.exit(1)
        # invoke method with same name
        getattr(self, args.mode)()

    def learn(self):
        """Parse the ``learn`` arguments and call ``learn_mode``.

        Arguments read from ``sys.argv[2:]``: the output ``directory``,
        ``-e`` (number of episodes, default 100000) and ``-g`` (side of the
        square grid, default 4).
        """
        parser = argparse.ArgumentParser(
            description='Let the agents learn a policy during n episodes')
        parser.add_argument('directory',
                            help="directory to store the rules file")
        parser.add_argument('-e', help="number of episode", default=100000, type=int)
        parser.add_argument('-g', help="grid size", default=4, type=int)

        args = parser.parse_args(sys.argv[2:])
        print('Running learn mode, episode={}, grid={}'.format(args.e, args.g))

        # the CLI only supports square grids: the same size is used twice
        n_episode = args.e
        grid = (args.g, args.g)
        directory = args.directory
        learn_mode(n_episode, grid, directory)

    def play(self):
        """Parse the ``play`` arguments and call ``play_mode``.

        Arguments read from ``sys.argv[2:]``: the ``directory`` and ``name``
        of the rules file, and ``-g`` (side of the square grid, default 4).
        """
        parser = argparse.ArgumentParser(
            description='Play the game with a learned policy')
        parser.add_argument('directory', help="directory of the rules file")
        parser.add_argument('name', help="name of the rules file")
        parser.add_argument('-g', help="grid size", default=4, type=int)

        args = parser.parse_args(sys.argv[2:])
        print('Running play mode, grid={}, directory={} name={}'.format(args.g, args.directory, args.name))

        # the CLI only supports square grids: the same size is used twice
        grid = (args.g, args.g)

        play_mode(grid, args.directory, args.name)

    def test(self):
        """Parse the ``test`` arguments and call ``test_mode``.

        Arguments read from ``sys.argv[2:]``: ``-e`` (number of episodes,
        default 100000), ``-r`` (number of runs, default 25), ``-g`` (side of
        the square grid, default 4) and ``-v/--verbose``.
        """
        parser = argparse.ArgumentParser(
            description='Test the performance of the learning')
        parser.add_argument('-e', help="number of episode", default=100000, type=int)
        parser.add_argument('-r', help="number of run", default=25, type=int)
        parser.add_argument('-g', help="grid size", default=4, type=int)
        parser.add_argument('-v', '--verbose', action='store_true')

        args = parser.parse_args(sys.argv[2:])
        print('Running test mode, grid={}, run={}, episode={}'.format(args.g, args.r, args.e))

        # the CLI only supports square grids: the same size is used twice
        grid = (args.g, args.g)
        n_episode = args.e
        n_run = args.r
        verbose = args.verbose
        test_mode(n_episode, n_run, grid, verbose)


def learn_mode(n_episode, grid, directory):
    """Train two predator agents and save the resulting rules to disk.

    Args:
        n_episode: number of learning episodes passed to ``run_episodes``.
        grid: pair of grid dimensions; both are forwarded to ``Game`` in
            order, and the second one to ``generate_game_rules``.
        directory: directory handed to ``rules.save_rules``.

    The rules are saved under the name ``"<grid[0]>_<grid[1]>_grid"``.
    """
    first_dim, second_dim = grid

    # game environment and the context graph/rules shared by both predators
    game = Game(first_dim, second_dim)
    # NOTE(review): the rules are generated from the second dimension here;
    # which axis it represents only matters for non-square grids — confirm.
    rules = generate_game_rules(second_dim)

    # predators 0 and 1 share the same rules object
    predators = [Agent(pred_id, rules) for pred_id in (0, 1)]

    # learning phase
    run_episodes(n_episode, game, rules, predators)

    # persist what has been learned
    rules.save_rules(directory=directory,
                     name="{}_{}_grid".format(first_dim, second_dim))


def play_mode(grid, directory, file_name):
    """Play one game step by step with previously saved rules.

    After every joint move the grid is printed and the user is asked whether
    to continue (``n``) or stop (``s``). The loop also ends once the game
    reports a capture.

    Args:
        grid: pair of grid dimensions, forwarded to ``Game`` in order.
        directory: directory passed to ``Rules.load_rules``.
        file_name: name passed to ``Rules.load_rules``.
    """
    game = Game(grid[0], grid[1])

    # load the learned rules and give them to both predators
    rules = Rules()
    rules.load_rules(directory=directory, name=file_name)
    predators = [Agent(pred_id, rules) for pred_id in (0, 1)]

    captured = False
    while not captured:
        state = game.state

        # each predator picks its action with the second argument set to 0.
        j_action = {}
        for agent in predators:
            j_action[agent.pred_id] = agent.get_action_choice(state, 0.)

        # apply the joint action; only the capture flag is needed here
        _, _, captured = game.play(j_action)

        game.print()

        # keep asking until the answer is one of the two accepted letters
        while True:
            choice = input("n -> next episode, s -> stop : ")
            print(choice)
            if choice in ("s", "n"):
                break

        if choice == "s":
            return


def _test_mode_run(run_times, test_games, size_interval, n_episode,
                   line_to_up, run, lock, verbose, grid):
    """Worker executed in a child process for one run of ``test_mode``.

    Alternates ``size_interval`` learning episodes with an evaluation on
    ``test_games`` and appends the list of evaluation results to
    ``run_times``.

    Kept at module level (not nested in ``test_mode``) so that it can be
    pickled when the multiprocessing start method is ``spawn`` or
    ``forkserver``.

    Args:
        run_times: shared list receiving this run's list of mean capture times.
        test_games: games used for every evaluation.
        size_interval: number of learning episodes between two evaluations.
        n_episode: total number of learning episodes requested.
        line_to_up: number of terminal lines between the cursor and this
            run's progress line (verbose mode only).
        run: zero-based index of the run, used in the progress message.
        lock: lock serialising terminal output between processes.
        verbose: when true, refresh this run's progress line after each interval.
        grid: pair of grid dimensions.
    """
    # create a specific context graph/rules
    rules = generate_game_rules(grid[0])

    # create predator agents
    predators = [Agent(0, rules), Agent(1, rules)]

    # game used to run episodes
    learn_game = Game(grid[0], grid[1])

    # only complete intervals are run
    n_interval = n_episode // size_interval

    # first measurement is taken before any learning
    times = [make_capture_test(predators, test_games)]

    for i in range(n_interval):
        run_episodes(size_interval, learn_game, rules, predators)
        times.append(make_capture_test(predators, test_games))

        if verbose:
            # ``with`` guarantees the lock is released even if a write fails
            with lock:
                # move the cursor up to this run's line, clear and rewrite it
                for _ in range(line_to_up):
                    sys.stdout.write("\033[F")
                sys.stdout.write("\033[K")
                sys.stdout.write("run {} : {} %".format((run + 1), ((i + 1) / n_interval) * 100))
                # move the cursor back down to where it was
                for _ in range(line_to_up):
                    print("\r")
                # push everything out before another process may write
                sys.stdout.flush()

    run_times.append(times)


def test_mode(n_episode: int, n_run: int, grid: Tuple[int, int], verbose=False, size_interval: int = 500):
    """Measure how the capture time evolves during learning, averaged over runs.

    Starts one process per run. Each run evaluates the predators on the same
    100 test games before learning and after every ``size_interval`` learning
    episodes. The per-measurement averages over all runs are plotted to
    ``images/plots/`` and dumped to ``json/``.

    Args:
        n_episode: total number of learning episodes per run.
        n_run: number of independent runs (one process each).
        grid: pair of grid dimensions.
        verbose: when true, print one progress line per run.
        size_interval: number of learning episodes between two evaluations.
    """
    import os  # local import: only needed to create the output directories

    # NOTE(review): unpacking order kept from the original and used only for
    # the output file names; it matters for non-square grids only — confirm.
    ncol, nrow = grid

    # 100 test games, built with the same argument order as the learning game
    # so that both share the same dimensions on non-square grids
    test_games = [Game(grid[0], grid[1]) for _ in range(100)]

    with Manager() as manager:
        run_times = manager.list()  # <-- can be shared between processes.
        processes = []
        lock = Lock()
        for run in range(n_run):
            p = Process(target=_test_mode_run,
                        args=(run_times, test_games,
                              size_interval, n_episode,
                              (n_run - run), run, lock, verbose, grid))
            if verbose:
                # reserve one terminal line per run; the worker rewrites it
                print("run {} : {} %".format((run + 1), 0))
            p.start()
            processes.append(p)
        for p in processes:
            p.join()

        # average the results over the runs (column i = i-th measurement)
        avg = [float(sum(col)) / len(col) for col in zip(*list(run_times))]

    # Measurement i is taken after i * size_interval learning episodes.
    # Deriving the axis from ``avg`` keeps both sequences the same length
    # even when n_episode is not a multiple of size_interval.
    episode = [i * size_interval for i in range(len(avg))]

    # make sure the output directories exist so results are not lost
    os.makedirs('images/plots', exist_ok=True)
    os.makedirs('json', exist_ok=True)

    plt.plot(episode, avg)
    plt.xlabel("learning episode")
    plt.ylabel("capture/episode")
    plt.title("Evolution of cooperation")
    plt.savefig('images/plots/{}_{}_grid.png'.format(nrow, ncol))

    data = {"avg": avg, "episode": episode}
    with open('json/{}_{}_grid.json'.format(nrow, ncol), 'w') as outfile:
        json.dump(data, outfile)


def run_episodes(n_episode: int, game: Game, rules: Rules, predators: List[Agent]):
    """Run ``n_episode`` learning episodes and update ``rules`` after each step.

    Every episode starts from a random state and lasts until the game reports
    a capture. After each joint move the rules are updated from the
    predators' values for the current state and, unless the prey was
    captured, for the next state.

    Args:
        n_episode: number of episodes to run.
        game: game instance that is reset and played in place.
        rules: rules object whose values are updated.
        predators: agents choosing the actions.
    """
    # learning parameters, forwarded to the agents and to the rule update
    alpha = 0.3
    gamma = 0.9
    epsilon = 0.2

    for _ in range(n_episode):
        # every episode begins from a random initial state
        game.reset(random_state=True)

        done = False
        while not done:
            state = game.state

            # joint action: one choice per predator, keyed by predator id
            j_action = {agent.pred_id: agent.get_action_choice(state, epsilon)
                        for agent in predators}

            # apply it and observe the outcome
            next_state, rewards, done = game.play(j_action)

            # values of the state the action was taken in
            q_values = {agent.pred_id: agent.q_value(state)
                        for agent in predators}

            # after a capture there is no future value to account for
            if done:
                future_rewards = {agent.pred_id: 0 for agent in predators}
            else:
                future_rewards = {agent.pred_id: agent.q_value(next_state)
                                  for agent in predators}

            rules.update_rule_values(state, j_action, rewards, q_values, future_rewards, alpha, gamma)


def make_capture_test(predators: List[Agent], test_games: List[Game]):
    """Play every test game until capture and return the mean of ``game.round``.

    Args:
        predators: agents choosing the actions (second argument of
            ``get_action_choice`` fixed to 0.).
        test_games: games to play; each one is reset before being played.

    Returns:
        The mean, over all test games, of ``game.round`` at capture.
    """
    rounds_needed = []

    for game in test_games:
        game.reset()

        done = False
        while not done:
            state = game.state

            # joint action: one choice per predator, keyed by predator id
            j_action = {agent.pred_id: agent.get_action_choice(state, 0.)
                        for agent in predators}

            # only the capture flag of the outcome matters here
            _, _, done = game.play(j_action)

        rounds_needed.append(game.round)

    return mean(rounds_needed)


# Run the command-line dispatcher only when executed as a script; the guard
# keeps an import of this module from parsing sys.argv.
if __name__ == '__main__':
    Main()