"""
Very simple Reinforced learning program for "Multi-armed bandit".
"""
import numpy as np
import matplotlib.pyplot as plt
from loguru import logger


def play_bandit(a, variance=1.0):
    """
    Pull a bandit arm.

    :param a: Index of the arm to pull; also the mean of its reward distribution.
    :param variance: Variance of the reward distribution.
    :return: Sampled reward value.
    """
    return np.random.normal(a, scale=np.sqrt(variance))
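
# For instance, play_bandit(5) samples from N(5, variance); with the default
# variance of 1.0 that is a standard normal shifted to mean 5.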


def explore(q):
    """
    Explore: draw an arm uniformly at random (possibly the greedy arm itself).

    :param q: Current action-value estimates.
    :return: Index of the chosen arm.
    """
    return np.random.randint(len(q))
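
# With the default 20 arms, explore() returns a uniform draw from {0, ..., 19},
# independent of the current estimates in q.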


def exploit(q):
    """
    Exploit (greedy action): draw the arm with the highest estimate, breaking
    ties uniformly at random.

    :param q: Current action-value estimates.
    :return: Index of the chosen arm.
    """
    greedy_actions, = np.where(q == np.max(q))
    return np.random.choice(greedy_actions)
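
# Example: with q = [1.0, 3.0, 3.0], np.where yields the tied indices [1, 2]
# and exploit() returns one of them with equal probability.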


def act(q, epsilon=0.1):
    """
    Decide whether to explore or to exploit: explore with probability epsilon,
    otherwise act greedily.

    :param q: Current action-value estimates.
    :param epsilon: Probability of taking an explore action.
    :return: Index of the chosen arm.
    """
    if np.random.random() > epsilon:
        return exploit(q)
    else:
        return explore(q)
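
# Over many calls, roughly an epsilon fraction of actions come from explore();
# since explore() may itself land on the greedy arm, the effective probability
# of a non-greedy action is slightly below epsilon.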


def update_q(old_estimate, target, k):
    """
    Update a running (sample-average) estimate:
    Q_{k+1} = Q_k + (target - Q_k) / (k + 1).

    :param old_estimate: Previous estimate.
    :param target: Newly observed value.
    :param k: Number of updates applied to this estimate so far.
    :return: New estimate.
    """
    step_size = 1. / (k + 1)
    error = target - old_estimate
    return old_estimate + step_size * error
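
# Worked example of the sample-average update, starting from Q = 0:
#   update_q(0.0, 2.0, 0) -> 0.0 + (2.0 - 0.0) / 1 = 2.0
#   update_q(2.0, 4.0, 1) -> 2.0 + (4.0 - 2.0) / 2 = 3.0
# i.e. the estimate equals the running mean of the observed targets (2 and 4).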


def simulate_agent(n=20, T=2500, epsilon=0.01, variance=1.0):
    """
    Run an epsilon-greedy agent that repeatedly chooses between "explore" and
    "exploit" actions.

    :param n: Number of arms (arm a has mean reward a).
    :param T: Number of iterations.
    :param epsilon: Probability of taking an explore action.
    :param variance: Variance of the reward distribution.
    :return: Final estimates, actions taken, rewards received.
    """
    q = np.zeros(n)
    counts = np.zeros(n, dtype=int)  # pulls per arm, for the sample average
    actions = np.zeros(T, dtype=int)
    rewards = np.zeros(T)
    for t in range(T):
        a = act(q, epsilon)
        reward = play_bandit(a, variance)
        # Sample-average update: passing the per-arm pull count makes the
        # step size decay as 1/(k + 1) rather than staying constant.
        q[a] = update_q(q[a], reward, counts[a])
        counts[a] += 1
        actions[t] = a
        rewards[t] = reward
    return q, actions, rewards
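
# With the defaults (20 arms with means 0..19), arm 19 is optimal. Note that
# epsilon = 0.01 makes exploration sparse (about 25 explore steps in 2500
# iterations), so the agent can linger on a suboptimal arm for long stretches.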


def main():
    q, actions, rewards = simulate_agent()

    # Plot the action taken at each step.
    plt.plot(actions)
    plt.xlabel('$t$')
    plt.ylabel('$a_t$')
    plt.show()

    # Plot the reward received at each step.
    plt.plot(rewards)
    plt.xlabel('$t$')
    plt.ylabel('$r_t$')
    plt.show()


if __name__ == '__main__':
    logger.info('Started.')
    main()
    logger.info('Ended.')