learning6-value.py
# Predicted Value Learning
# Here, we learn to predict future rewards. The idea is to learn the
# function V(s) where, for the state s the agent is in at time t,
#   V(s) = R(t) + 0.9*R(t+1) + 0.9**2*R(t+2) + ...
# By substitution, this gives the recursion V(s(t)) = R(t) + 0.9*V(s(t+1)),
# which suggests an error rule: Error(t) = V(t-1) - R(t-1) - 0.9*V(t)
# We convert this to continuous time by filtering the t-1 terms with a long
# synaptic time constant and the t term with a short one.
# In the model below, the agent ALWAYS MOVES RANDOMLY. It is not *using* what
# it learns to change its movement. The goal here is simply to anticipate
# future rewards. Connecting this learning to the action selection system
# as in the previous example is left as an exercise for the reader.
# The agent is given a reward whenever it is in the green square, and a
# punishment (negative reward) whenever it is in the red square. After some
# time, it should learn to start anticipating this reward/punishment as
# shown in the value graph. In particular, the value graph should drop to a
# low value when it turns right at the top of the T-maze, and it should
# go to a high value when it turns left at the top of the T-maze.
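#
# --- Added illustration (not part of the original model; nothing below uses
# it, and the helper name is made up for this sketch) ---
# A discrete-time sketch of the error rule above: repeatedly nudging V[t-1]
# toward R[t-1] + 0.9*V[t] makes V converge to the discounted sum of future
# rewards.
def _td_illustration():
    R = [0.0, 0.0, 0.0, 10.0]   # reward arrives only at the final step
    V = [0.0] * 5               # V[4] stays 0 (nothing happens after the end)
    alpha = 0.1
    for _ in range(1000):
        for t in range(1, 5):
            error = V[t - 1] - R[t - 1] - 0.9 * V[t]
            V[t - 1] -= alpha * error
    return V                    # approximately [7.29, 8.1, 9.0, 10.0, 0.0]
# --- end of added illustration ---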
import nengo
import numpy as np
# requires CCMSuite https://github.com/tcstewar/ccmsuite/
import ccm.lib.grid
import ccm.lib.cellular
import ccm.lib.continuous
import ccm.ui.nengo
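
# The map for the grid world: '#' marks a wall, 'G' the green (reward) cell,
# and 'B' the red (punishment) cell (see Cell.load below).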
mymap="""
#######
#     #
# # # #
# # # #
#G   B#
#######
"""
class Cell(ccm.lib.grid.Cell):
    def color(self):
        if self.wall:
            return 'black'
        elif self.reward > 0:
            return 'green'
        elif self.reward < 0:
            return 'red'
        return None

    def load(self, char):
        if char == '#':
            self.wall = True
        self.reward = 0
        if char == 'G':
            self.reward = 10
        elif char == 'B':
            self.reward = -10
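
# Build the world from the map and place the agent's body in the left
# corridor of the maze.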
world = ccm.lib.cellular.World(Cell, map=mymap, directions=4)
body = ccm.lib.continuous.Body()
world.add(body, x=1, y=2, dir=2)
def move(t, x):
    speed, rotation = x
    dt = 0.001
    max_speed = 20.0
    max_rotate = 10.0
    body.turn(rotation * dt * max_rotate)
    body.go_forward(speed * dt * max_speed)
    # Depending on which side of the maze the agent is on, one cell in the
    # grid is turned into a wall and another is opened up, so the maze layout
    # changes as the agent moves from one side to the other.
    if int(body.x) == 1:
        world.grid[4][4].wall = True
        world.grid[4][2].wall = False
    if int(body.x) == 4:
        world.grid[4][2].wall = True
        world.grid[4][4].wall = False
def detect(t):
    # Distance to the nearest wall in three directions (45 degrees to the
    # left, straight ahead, 45 degrees to the right), up to 4 cells away.
    angles = (np.linspace(-0.5, 0.5, 3) + body.dir) % world.directions
    return [body.detect(d, max_distance=4)[0] for d in angles]
np.random.seed(1)
model = nengo.Network(seed=2)
with model:
    # Apply the decoded (speed, rotation) command to the agent's body.
    movement = nengo.Node(move, size_in=2)

    # Display of the grid world (for nengo_gui).
    env = ccm.ui.nengo.GridNode(world, dt=0.005)

    # Wall-distance sensors, represented in a small noisy ensemble so that
    # the agent's movement stays somewhat random.
    stim_radar = nengo.Node(detect)
    radar = nengo.Ensemble(n_neurons=50, dimensions=3, radius=4, seed=2,
                           noise=nengo.processes.WhiteSignal(10, 0.1, rms=1))
    nengo.Connection(stim_radar, radar)

    # Simple obstacle-avoidance controller: steer toward the more open side
    # and slow down (or back up) when a wall is directly ahead.
    def braiten(x):
        turn = x[2] - x[0]
        spd = x[1] - 0.5
        return spd, turn
    nengo.Connection(radar, movement, function=braiten)
    # The agent's position and heading, rescaled to roughly [-1, 1], serve as
    # the state from which the value is predicted.
    def position_func(t):
        return (body.x / world.width * 2 - 1,
                1 - body.y / world.height * 2,
                body.dir / world.directions)
    position = nengo.Node(position_func)
    state = nengo.Ensemble(100, 3)
    nengo.Connection(position, state, synapse=None)

    # Reward from the environment (+10 on green, -10 on red, 0 elsewhere).
    reward = nengo.Node(lambda t: body.cell.reward)
    tau = 0.1
    # Learn the value function with the PES rule.  The connection initially
    # decodes 0 and is adjusted online by the error signal assembled below.
    value = nengo.Ensemble(n_neurons=50, dimensions=1)
    learn_conn = nengo.Connection(state, value, function=lambda x: 0,
                                  learning_rule_type=nengo.PES(learning_rate=1e-4,
                                                               pre_tau=tau))
    # Assemble Error(t) = V(t-1) - R(t-1) - 0.9*V(t): the slowly filtered
    # (synapse=tau) connections supply the t-1 terms and the quickly filtered
    # (synapse=0.01) connection supplies the t term.
    nengo.Connection(reward, learn_conn.learning_rule,
                     transform=-1, synapse=tau)
    nengo.Connection(value, learn_conn.learning_rule,
                     transform=-0.9, synapse=0.01)
    nengo.Connection(value, learn_conn.learning_rule,
                     transform=1, synapse=tau)
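
    # Added suggestion (not in the original example): when running this model
    # with nengo.Simulator rather than nengo_gui, a probe can record the
    # learned value estimate for later plotting.
    value_probe = nengo.Probe(value, synapse=0.03)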