Xiaowu/20220123 (#712)
* update * Update MRP-1.py * up * uo * uui * up * update * oo * ui * ui * Update MDP_FrozenLake_Optimal.py
@ -0,0 +1,299 @@
$$
p(s'|s,a) = \Pr \{S_t=s'|S_{t-1}=s,A_{t-1}=a\}
$$

$$
\sum_{i=0}^n p(s_i'|s_j,a) = 1
$$

$$
p(s_1'|s_j,a) + p(s_2'|s_j,a) + p(s_3'|s_j,a) = 1
$$

$$
P =
\begin{bmatrix}
p(s_1|s_1) & p(s_1|s_2) & \cdots & p(s_1|s_n) \\
p(s_2|s_1) & p(s_2|s_2) & \cdots & p(s_2|s_n) \\
\vdots & \vdots & \ddots & \vdots \\
p(s_n|s_1) & p(s_n|s_2) & \cdots & p(s_n|s_n)
\end{bmatrix}
$$

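A quick sanity check of the row-sum constraint above can be done with NumPy. This is a minimal sketch with a hypothetical helper (`check_transition_matrix` is not part of this PR); it assumes the convention used by the data files below, where a terminal state is stored as an all-zero row:

```python
import numpy as np

def check_transition_matrix(P):
    """Assert every non-terminal row of P sums to 1."""
    P = np.asarray(P)
    for i, row in enumerate(P):
        if row.sum() == 0.0:
            continue  # terminal state stored as an all-zero row
        assert np.isclose(row.sum(), 1.0), f"row {i} sums to {row.sum()}"

# tiny example with a 2-state chain whose second state is terminal
check_transition_matrix([[0.9, 0.1], [0.0, 0.0]])
```
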
Reward function

$$
R(s)=\mathbb {E} \ [R_{t} \ | \ S_{t-1}=s,A_{t-1}=a ]
$$

$$
R(s)=\mathbb {E} \ [R_{t} \ | \ S_{t-1}=s ]
$$

$$
\begin{aligned}
G_t &= R_{t+1} + \gamma R_{t+2} + \gamma^2 R_{t+3} + \cdots + \gamma^{T-t-1} R_{T} \\
&= \sum_{k=0}^{T-t-1} \gamma^k R_{t+k+1}, \ 0 \le \gamma \le 1
\end{aligned}
$$

$$
\begin{aligned}
G_t &= R_{t+1} + \gamma R_{t+2} + \gamma^2 R_{t+3} + \gamma^3 R_{t+4} + \cdots \\
&= R_{t+1} + \gamma (R_{t+2} + \gamma R_{t+3} + \gamma^{2} R_{t+4}+\cdots) \\
&= R_{t+1} + \gamma G_{t+1}
\end{aligned}
$$

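The recursion $G_t = R_{t+1} + \gamma G_{t+1}$ suggests computing returns backwards over a finite reward sequence. A minimal sketch, using a hypothetical helper and a made-up reward list purely for illustration:

```python
def returns(rewards, gamma):
    """Compute G_0, G_1, ... from a finite list of rewards R_1..R_T."""
    G = 0.0
    out = []
    for r in reversed(rewards):
        G = r + gamma * G          # G_t = R_{t+1} + gamma * G_{t+1}
        out.append(G)
    return list(reversed(out))

print(returns([-2, -2, -2, 10, 0], gamma=0.9))   # hypothetical reward sequence
```
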
$$
R_s = \mathbb{E} [R_{t+1} | S_t=s]
$$

$$
\begin{aligned}
V(s) &= \mathbb{E} [G_t \ | \ S_t=s] \\
&= \mathbb{E} [R_{t+1} + \gamma R_{t+2} + \gamma^2 R_{t+3} + \cdots \ | \ S_t=s] \\
&= \mathbb{E} [R_{t+1} + \gamma G_{t+1} \ | \ S_t=s] \\
&= \mathbb{E} [R_{t+1} \ | \ S_t=s] + \gamma \mathbb{E} [G_{t+1} \ | \ S_t=s] \\
&= R_s + \gamma \, \mathbb{E} [V(S_{t+1}) \ | \ S_t=s]
\end{aligned}
$$

Checking against the example, $V(Class3) = 4.09$, and indeed

$$
\begin{aligned}
V(Class3) &= R_{Class3}+\gamma \big[ P(Pub|Class3) \, V(Pub) + P(Pass|Class3) \, V(Pass) \big] \\
&= (-2)+0.9 \times (0.4 \times 1.93 + 0.6 \times 10) = 4.09
\end{aligned}
$$

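The same check is one line of Python (1.93 and 10 are the $V(Pub)$ and $V(Pass)$ figures quoted above, with $\gamma = 0.9$):

```python
print(-2 + 0.9 * (0.4 * 1.93 + 0.6 * 10))   # ~= 4.09, matching V(Class3)
```
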
$$
V(s) = R_s + \gamma \sum_{s'} P(s'|s) \, V(s')
$$

$$
V(s) = R_s + \gamma \sum_{s' \in S} P_{ss'} \cdot V(s')
$$

$$
V(s) = R_s + \gamma \, [p_1V(s'_1) + p_2V(s'_2) + p_3V(s'_3)]
$$

Matrix form

$$
V = R + \gamma PV
$$

$$
\begin{bmatrix}
V(1) \\
V(2) \\
\vdots \\
V(n)
\end{bmatrix}
=
\begin{bmatrix}
R_1 \\
R_2 \\
\vdots \\
R_n
\end{bmatrix}
+\gamma
\begin{bmatrix}
P_{11} & P_{12} & \cdots & P_{1n} \\
P_{21} & P_{22} & \cdots & P_{2n} \\
\vdots & \vdots & \ddots & \vdots \\
P_{n1} & P_{n2} & \cdots & P_{nn}
\end{bmatrix}
\begin{bmatrix}
V(1) \\
V(2) \\
\vdots \\
V(n)
\end{bmatrix}
$$

$$
V - \gamma PV = R \\
(I-\gamma P)V = R \\
V = (I - \gamma P)^{-1} R
$$

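For a small MRP the closed form $V = (I - \gamma P)^{-1}R$ can be evaluated directly with NumPy; this is what the `Matrix` function further down in this PR does. A minimal sketch, assuming the `Rewards` and `Matrix` arrays from the student data file below (imported under the module name `Data_Student` used elsewhere in this PR) and the $\gamma = 0.9$ of the worked example:

```python
import numpy as np
import Data_Student as ds          # the MRP data defined later in this PR

gamma = 0.9
I = np.eye(ds.Matrix.shape[0])
V = np.linalg.inv(I - gamma * ds.Matrix) @ ds.Rewards
for s in ds.States:
    print(s, "= {:.2f}".format(V[s.value]))
```
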
State-value function under a policy

$$
v_{\pi}(s)=\mathbb {E}_{\pi} [ G_t |S_t=s]
$$

$$
\begin{aligned}
v_{\pi}(s)&=\sum_{a \in A} \pi(a|s) q_\pi(s,a) \\
&=\pi(a_1|s) q_{\pi}(s,a_1)+\pi(a_2|s) q_{\pi}(s,a_2)+\pi(a_3|s) q_{\pi}(s,a_3)
\end{aligned}
$$

Action-value function under a policy

$$
q_{\pi}(s,a)=\mathbb E_{\pi} [G_t | S_t=s, A_t=a]
$$

$$
\begin{aligned}
q_{\pi}(s,a)&=R_s^a + \gamma \sum_{s' \in S} P^a_{ss'} v_{\pi}(s') \\
&= R_s^a + \gamma [P_1 v_{\pi}(s'_1)+P_2 v_{\pi}(s'_2)]
\end{aligned}
$$

$$
v_{\pi}(s)=\sum_{a \in A} \pi(a|s)\Big[ R_s^a + \gamma \sum_{s' \in S} P^a_{ss'} v_{\pi}(s') \Big]
$$

$$
q_{\pi}(s,a)=R_s^a + \gamma \sum_{s' \in S} P^a_{ss'} \sum_{a' \in A} \pi(a'|s') q_\pi(s',a')
$$

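One backup of the Bellman expectation equation is just two nested sums; the `V_pi` routine later in this PR iterates exactly this until convergence. A minimal sketch of a single backup, with a hypothetical helper name and assuming the policy and dynamics are stored as the `Pi_sa` (state to action) and `P_as` (action to state) arrays used by that code:

```python
def backup_v(s, Pi_sa, P_as, Rewards, V, gamma):
    """One application of v(s) = sum_a pi(a|s) [R_s^a + gamma * sum_s' P_ss'^a v(s')]."""
    total = 0.0
    for a, pi_a in enumerate(Pi_sa[s]):
        expected_next = sum(p * V[s2] for s2, p in enumerate(P_as[a]))
        total += pi_a * (Rewards[a] + gamma * expected_next)
    return total
```
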
$$
\begin{aligned}
V_1 &= \pi(A_{Play}|S_{V_1})(R_{Play}+\gamma P_{11}V_1)+\pi(A_{Quit}|S_{V_1})(R_{Quit}+\gamma P_{12}V_2) \\
V_2 &= \pi(A_{Play}|S_{V_2})(R_{Play}+\gamma P_{21}V_1)+\pi(A_{Study1}|S_{V_2})(R_{Study1}+\gamma P_{23}V_3) \\
V_3 &= \pi(A_{Sleep}|S_{V_3})(R_{Sleep}+\gamma P_{30}V_0)+\pi(A_{Study2}|S_{V_3})(R_{Study2}+\gamma P_{34}V_4) \\
V_4 &= \pi(A_{Pass}|S_{V_4})(R_{Pass}+\gamma P_{40}V_0)+\pi(A_{Pub}|S_{V_4})(R_{Pub}+\gamma P_{42}V_2+\gamma P_{43}V_3+\gamma P_{44}V_4)
\end{aligned}
$$

According to the Bellman optimality equation

$$
V_*(s) = \underset{a}{\max} \Big[R_s^a + \gamma \sum_{s' \in S} P_{ss'}^aV_*(s')\Big]
$$

with $\gamma=1$:

$$
V_{Rest}=V_0=0 \\
V_{Game} = V_1 = \max (-1+V_1, \ 0+V_2) \\
V_{Class1}=V_2=\max (-1+V_1, \ -2+V_3) \\
V_{Class2}=V_3 = \max (0 + V_0, \ -2+V_4) \\
V_{Class3} = V_4 = \max (10+V_0, \ 1+0.2V_2+0.4V_3+0.4V_4)
$$

To solve the above system of equations, note that an equation of the form

$$
x = \max (x+a, \ b)
$$

where $a$ and $b$ are constants with $a<0$ can only hold when $x=b$. Therefore

$$
V_1 = \max (-1+V_1, \ 0+V_2)=V_2 \\
V_2=\max (-1+V_1, \ -2+V_3)=\max (-1+V_2, \ -2+V_3)=V_3-2=V_1 \\
V_3 = V_1+2 \\
V_3 = \max (0 + V_0, \ -2+V_4)=V_4-2 \\
V_4 = V_3+2=V_1+4 \\
V_4 = \max (10+V_0, \ 1+0.2V_2+0.4V_3+0.4V_4)
$$

Substituting $V_1$ for all the other variables:

$$
V_4 = \max (10, \ 1+0.2V_1+0.4(V_1+2)+0.4(V_1+4))=\max (10, \ V_1+3.4)=\max(10, \ V_4-0.6)
$$

Therefore

$$
V_4=10 \\
V_1=V_4-4=6 \\
V_2=V_1=6 \\
V_3=V_1+2=8
$$

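These hand-derived values can be confirmed by simply iterating the five max-equations above until they stop changing. A minimal sketch (the update order and iteration count are arbitrary choices, not part of the original text):

```python
V = [0.0] * 5            # V0..V4: Rest, Game, Class1, Class2, Class3
for _ in range(100):
    V[0] = 0
    V[1] = max(-1 + V[1], 0 + V[2])
    V[2] = max(-1 + V[1], -2 + V[3])
    V[3] = max(0 + V[0], -2 + V[4])
    V[4] = max(10 + V[0], 1 + 0.2 * V[2] + 0.4 * V[3] + 0.4 * V[4])
print(V)                 # values converge to 0, 6, 6, 8, 10
```
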
For the policy that chooses each available action with probability 0.5, according to

$$
v_{\pi}(s)=\sum_{a \in A} \pi(a|s)\Big[ R_s^a + \gamma \sum_{s' \in S} P^a_{ss'} v_{\pi}(s') \Big]
$$

with $\gamma=1$:

$$
V_0=0 \\
V_1=0.5(-1+1 \cdot 1 \cdot V_1)+0.5(0+1 \cdot 1 \cdot V_2)=0.5V_1+0.5V_2-0.5 \\
V_2=0.5(-1+1 \cdot 1 \cdot V_1)+0.5(-2+1 \cdot 1 \cdot V_3)=0.5V_1+0.5V_3-1.5 \\
V_3=0.5(0+1 \cdot 1 \cdot 0)+0.5(-2+1 \cdot 1 \cdot V_4)=0.5V_4-1 \\
V_4=0.5(10+1 \cdot 1 \cdot 0)+0.5(1+1 \cdot 0.2 V_2+1 \cdot 0.4 V_3+1 \cdot 0.4 V_4)=0.1V_2+0.2V_3+0.2V_4+5.5
$$

Solving this linear system gives

$$
V_1=-2.3 \\
V_2=-1.3 \\
V_3=2.7 \\
V_4=7.4
$$

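The same four-variable linear system can be solved exactly with `np.linalg.solve`; rounding reproduces the figures above. A minimal sketch, with the coefficient matrix rearranged from the four equations just given:

```python
import numpy as np

# unknowns ordered V1, V2, V3, V4; each row rearranges one equation above
A = np.array([
    [0.5, -0.5, 0.0, 0.0],    # V1 - 0.5*V1 - 0.5*V2 = -0.5
    [-0.5, 1.0, -0.5, 0.0],   # V2 - 0.5*V1 - 0.5*V3 = -1.5
    [0.0, 0.0, 1.0, -0.5],    # V3 - 0.5*V4 = -1
    [0.0, -0.1, -0.2, 0.8],   # V4 - 0.1*V2 - 0.2*V3 - 0.2*V4 = 5.5
])
b = np.array([-0.5, -1.5, -1.0, 5.5])
print(np.round(np.linalg.solve(A, b), 1))   # [-2.3 -1.3  2.7  7.4]
```
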
$$
Q_*(s,a)=R_s^a + \gamma \sum_{s' \in S} P_{ss'}^aV_*(s')
$$

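With $\gamma = 1$ and the optimal state values derived above, the optimal action values follow directly from this formula; the `Q_star_from_V_star` routine later in this PR does the same thing in general. A minimal sketch for the student MDP, assuming the `P_as` and `Rewards` arrays from the second student data file below (module name `Data_Students2`, as imported elsewhere in this PR):

```python
import numpy as np
import Data_Students2 as ds2       # MDP data defined later in this PR

gamma = 1.0
V_star = np.array([0.0, 6.0, 6.0, 8.0, 10.0])   # Rest, Game, Class1, Class2, Class3
for a in ds2.Actions:
    q = ds2.Rewards[a.value] + gamma * ds2.P_as[a.value] @ V_star
    print(a, "= {:.1f}".format(q))
```
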
@ -0,0 +1,97 @@
import numpy as np


def V_pi(States, Pi_sa, P_as, Rewards, gamma):
    num_state = len(States)
    V_curr = [0.0] * num_state
    V_next = [0.0] * num_state
    count = 0
    # iterate until convergence
    while True:
        # sweep over every state s
        for curr_state in States:
            v_curr_sum = 0
            # policy probabilities pi(a|s) for the current state
            next_actions_prob = Pi_sa[curr_state.value]
            # loop over every action allowed by the policy
            for action_value, action_prob in enumerate(next_actions_prob):
                # transition probabilities P(s'|s,a) for this action
                next_states_prob = P_as[action_value]
                v_sum = 0
                # loop over every successor state
                for state_value, state_prob in enumerate(next_states_prob):
                    # math: \sum_{s'} P_{ss'}^a v_{\pi}(s')
                    v_sum += state_prob * V_next[state_value]
                # end for
                # math: \sum_a \pi(a|s) [R_s^a + \gamma \sum_{s'} P_{ss'}^a v_{\pi}(s')]
                v_curr_sum += action_prob * (Rewards[action_value] + gamma * v_sum)
            # end for
            V_curr[curr_state.value] = v_curr_sum
        # end for
        # check convergence
        if np.allclose(V_next, V_curr):
            break
        # copy V_curr into V_next for the next sweep
        V_next = V_curr.copy()
        count += 1
    # end while
    print(count)
    return V_next


def Q_pi(Actions, Pi_sa, P_as, Rewards, gamma):
    num_action = len(Actions)
    Q_curr = [0.0] * num_action
    Q_next = [0.0] * num_action
    count = 0
    # iterate until convergence
    while True:
        # sweep over every action
        for curr_action in Actions:
            q_curr_sum = 0
            # transition probabilities P(s'|s,a) for this action
            next_states_prob = P_as[curr_action.value]
            # sum over every successor state
            for state_value, state_prob in enumerate(next_states_prob):
                # policy probabilities pi(a'|s') in the successor state
                next_actions_prob = Pi_sa[state_value]
                q_sum = 0
                # sum over every next action
                for action_value, action_prob in enumerate(next_actions_prob):
                    # math: \sum_{a'} \pi(a'|s') q_{\pi}(s',a')
                    q_sum += action_prob * Q_next[action_value]
                # end for
                # math: \sum_{s'} P_{ss'}^a ( \sum_{a'} \pi(a'|s') q_{\pi}(s',a') )
                q_curr_sum += state_prob * q_sum
            # end for
            # math: q_{\pi}(s,a) = R_s^a + \gamma \sum_{s'} P_{ss'}^a ( \sum_{a'} \pi(a'|s') q_{\pi}(s',a') )
            Q_curr[curr_action.value] = Rewards[curr_action.value] + gamma * q_curr_sum
        # end for
        # check convergence
        if np.allclose(Q_next, Q_curr):
            break
        # copy Q_curr into Q_next for the next sweep
        Q_next = Q_curr.copy()
        count += 1
    # end while
    print(count)
    return Q_next


def Q_pi_from_V_pi(Actions, P_as, Rewards, gamma, v_pi):
    num_action = len(Actions)
    Q = [0.0] * num_action
    # loop over every action
    for curr_action in Actions:
        q_sum = 0
        # transition probabilities P(s'|s,a) for this action
        next_states_probs = P_as[curr_action.value]
        # sum over every successor state
        for next_state_value, next_state_prob in enumerate(next_states_probs):
            # math: \sum_{s'} P_{ss'}^a v_{\pi}(s')
            q_sum += next_state_prob * v_pi[next_state_value]
        # end for
        # math: q_{\pi}(s,a) = R_s^a + \gamma \sum_{s' \in S} P_{ss'}^a v_{\pi}(s')
        Q[curr_action.value] = Rewards[curr_action.value] + gamma * q_sum
    # end for
    return Q
@ -0,0 +1,106 @@
import numpy as np


# optimal state-value function
def V_star(States, Pi_sa, P_as, Rewards, gamma):
    num_state = len(States)
    V_curr = [0.0] * num_state
    V_next = [0.0] * num_state
    count = 0
    # iterate until convergence
    while True:
        # sweep over every state s
        for curr_state in States:
            list_v = []
            # policy probabilities pi(a|s) for the current state
            next_actions_probs = Pi_sa[curr_state.value]
            # loop over every action allowed by the policy
            for action_value, action_prob in enumerate(next_actions_probs):
                if action_prob > 0.0:
                    # transition probabilities P(s'|s,a) for this action
                    next_states_probs = P_as[action_value]
                    v_sum = 0
                    # loop over every successor state
                    for state_value, state_prob in enumerate(next_states_probs):
                        # math: \sum_{s'} P_{ss'}^a v_{*}(s')
                        v_sum += state_prob * V_next[state_value]
                    # end for
                    # math: \max_a [R_s^a + \gamma \sum_{s'} P_{ss'}^a v_{*}(s')]
                    list_v.append(Rewards[action_value] + gamma * v_sum)
            # end for
            if len(list_v) > 0:
                V_curr[curr_state.value] = max(list_v)
        # end for
        # check convergence
        if np.allclose(V_next, V_curr):
            break
        # copy V_curr into V_next for the next sweep
        V_next = V_curr.copy()
        count += 1
    # end while
    print(count)
    return V_next


# optimal action-value function
def Q_star(Actions, Pi_sa, P_as, Rewards, gamma):
    num_action = len(Actions)
    Q_curr = [0.0] * num_action
    Q_next = [0.0] * num_action
    count = 0
    # iterate (bounded, in case of slow convergence)
    while count < 100:
        # sweep over every action
        for curr_action in Actions:
            q_curr_sum = 0
            if curr_action == Actions.Sleep:
                continue
            # transition probabilities P(s'|s,a) for this action
            next_states_probs = P_as[curr_action.value]
            # sum over every successor state
            for state_value, state_prob in enumerate(next_states_probs):
                # policy probabilities pi(a'|s') in the successor state
                next_actions_probs = Pi_sa[state_value]
                list_q = []
                # collect the value of every available next action
                for next_action_value, next_action_prob in enumerate(next_actions_probs):
                    if next_action_prob > 0.0:
                        # math: q_{*}(s',a')
                        list_q.append(Q_next[next_action_value])
                # end for
                # math: \sum_{s'} P_{ss'}^a \max_{a'} q_{*}(s',a')
                if len(list_q) > 0:
                    q_curr_sum += state_prob * max(list_q)
            # end for
            # math: R_s^a + \gamma ( \sum_{s'} P_{ss'}^a \max_{a'} q_{*}(s',a') )
            Q_curr[curr_action.value] = Rewards[curr_action.value] + gamma * q_curr_sum
        # end for
        # check convergence
        if np.allclose(Q_next, Q_curr):
            break
        # copy Q_curr into Q_next for the next sweep
        Q_next = Q_curr.copy()
        count += 1
    # end while
    print(count)
    return Q_next


# math: q_*(s,a) = R_s^a + \gamma \sum_{s' \in S} P_{ss'}^a v_*(s')
def Q_star_from_V_star(Actions, P_as, Rewards, gamma, v_star):
    num_action = len(Actions)
    Q = [0.0] * num_action
    # loop over every action
    for curr_action in Actions:
        q_sum = 0
        if curr_action == Actions.Sleep:
            continue
        # transition probabilities P(s'|s,a) for this action
        next_states_probs = P_as[curr_action.value]
        # sum over every successor state
        for next_state_value, next_state_prob in enumerate(next_states_probs):
            # math: \sum_{s'} P_{ss'}^a v_{*}(s')
            q_sum += next_state_prob * v_star[next_state_value]
        # end for
        # math: R_s^a + \gamma \sum_{s'} P_{ss'}^a v_{*}(s')
        Q[curr_action.value] = Rewards[curr_action.value] + gamma * q_sum
    # end for
    return Q
@ -0,0 +1,88 @@
import math
import numpy as np
import tqdm
import multiprocessing as mp


def mc_single_process(
        Rewards, TransMatrix, States,
        start_state, end_states, episodes, gamma):
    num_state = len(Rewards)
    sum_gain = 0
    for episode in tqdm.trange(episodes):
        if start_state in end_states:
            # a terminal state may itself carry a reward
            return Rewards[start_state.value]
        curr_state_value = start_state.value
        gain = Rewards[curr_state_value]
        power = 1
        while True:
            next_state_value = np.random.choice(
                num_state, p=TransMatrix[curr_state_value])
            r = Rewards[next_state_value]
            gain += math.pow(gamma, power) * r
            if States(next_state_value) in end_states:
                # reached a terminal state, this episode ends
                break
            else:
                power += 1
                curr_state_value = next_state_value
        # end while
        sum_gain += gain
    # end for
    v = sum_gain / episodes
    return v


# Monte Carlo sampling, one worker process per start state
def MonteCarol(Rewards, TransMatrix, States, end_states, gamma, episodes):
    pool = mp.Pool(processes=6)
    Vs = []
    results = []
    for start_state in States:
        results.append(pool.apply_async(mc_single_process,
            args=(Rewards, TransMatrix, States, start_state, end_states, episodes, gamma,)))
    pool.close()
    pool.join()
    for i in range(len(results)):
        v = results[i].get()
        Vs.append(v)

    return Vs


# matrix (closed-form) method: V = (I - gamma * P)^{-1} R
def Matrix(ds, gamma):
    num_state = ds.Matrix.shape[0]
    I = np.eye(num_state)
    tmp1 = I - gamma * ds.Matrix
    tmp2 = np.linalg.inv(tmp1)
    vs = np.dot(tmp2, ds.Rewards)

    return vs


# iteration of the Bellman expectation equation
def Bellman(States, TransMatrix, Rewards, gamma):
    num_states = len(Rewards)
    V_curr = [0.0] * num_states
    V_next = [0.0] * num_states
    count = 0
    while count < 1000:
        # treat every state as start_state
        for start_state in States:
            # transition probabilities out of start_state
            next_states_probs = TransMatrix[start_state.value]
            v_sum = 0
            # accumulate transition probability times next-state value
            for next_state_value, next_state_prob in enumerate(next_states_probs):
                v_sum += next_state_prob * V_next[next_state_value]
            # end for
            V_curr[start_state.value] = Rewards[start_state.value] + gamma * v_sum
        # end for
        # check convergence
        if np.allclose(V_next, V_curr):
            break
        # copy V_curr into V_next for the next sweep
        V_next = V_curr.copy()
        count += 1
    # end while
    print(count)
    return V_next
@ -0,0 +1,115 @@
import numpy as np
from enum import Enum


# states
class States(Enum):
    Start = 0
    Safe1 = 1
    Hole2 = 2
    Safe3 = 3
    Safe4 = 4
    Safe5 = 5
    Safe6 = 6
    Safe7 = 7
    Hole8 = 8
    Safe9 = 9
    Hole10 = 10
    Safe11 = 11
    Safe12 = 12
    Safe13 = 13
    Safe14 = 14
    Goal15 = 15

# reward constants
Hole = -1
Goal = 5

# per-state rewards
Rewards = [0, 0, Hole, 0,
           0, 0, 0, 0,
           Hole, 0, Hole, 0,
           0, 0, 0, Goal]

# state-to-state transition matrix (each row lists the successors of one cell)
Matrix = np.array(
    [
        [0.0, 1/2, 0.0, 0.0,
         1/2, 0.0, 0.0, 0.0,
         0.0, 0.0, 0.0, 0.0,
         0.0, 0.0, 0.0, 0.0],  # 0
        [1/3, 0.0, 1/3, 0.0,
         0.0, 1/3, 0.0, 0.0,
         0.0, 0.0, 0.0, 0.0,
         0.0, 0.0, 0.0, 0.0],  # 1
        [0.0, 0.0, 0.0, 0.0,
         0.0, 0.0, 0.0, 0.0,
         0.0, 0.0, 0.0, 0.0,
         0.0, 0.0, 0.0, 0.0],  # 2
        [0.0, 0.0, 1/2, 0.0,
         0.0, 0.0, 0.0, 1/2,
         0.0, 0.0, 0.0, 0.0,
         0.0, 0.0, 0.0, 0.0],  # 3
        [1/3, 0.0, 0.0, 0.0,
         0.0, 1/3, 0.0, 0.0,
         1/3, 0.0, 0.0, 0.0,
         0.0, 0.0, 0.0, 0.0],  # 4
        [0.0, 1/4, 0.0, 0.0,
         1/4, 0.0, 1/4, 0.0,
         0.0, 1/4, 0.0, 0.0,
         0.0, 0.0, 0.0, 0.0],  # 5
        [0.0, 0.0, 1/4, 0.0,
         0.0, 1/4, 0.0, 1/4,
         0.0, 0.0, 1/4, 0.0,
         0.0, 0.0, 0.0, 0.0],  # 6
        [0.0, 0.0, 0.0, 1/3,
         0.0, 0.0, 1/3, 0.0,
         0.0, 0.0, 0.0, 1/3,
         0.0, 0.0, 0.0, 0.0],  # 7
        [0.0, 0.0, 0.0, 0.0,
         0.0, 0.0, 0.0, 0.0,
         0.0, 0.0, 0.0, 0.0,
         0.0, 0.0, 0.0, 0.0],  # 8
        [0.0, 0.0, 0.0, 0.0,
         0.0, 1/4, 0.0, 0.0,
         1/4, 0.0, 1/4, 0.0,
         0.0, 1/4, 0.0, 0.0],  # 9
        [0.0, 0.0, 0.0, 0.0,
         0.0, 0.0, 0.0, 0.0,
         0.0, 0.0, 0.0, 0.0,
         0.0, 0.0, 0.0, 0.0],  # 10
        [0.0, 0.0, 0.0, 0.0,
         0.0, 0.0, 0.0, 1/3,
         0.0, 0.0, 1/3, 0.0,
         0.0, 0.0, 0.0, 1/3],  # 11
        [0.0, 0.0, 0.0, 0.0,
         0.0, 0.0, 0.0, 0.0,
         1/2, 0.0, 0.0, 0.0,
         0.0, 1/2, 0.0, 0.0],  # 12
        [0.0, 0.0, 0.0, 0.0,
         0.0, 0.0, 0.0, 0.0,
         0.0, 1/3, 0.0, 0.0,
         1/3, 0.0, 1/3, 0.0],  # 13
        [0.0, 0.0, 0.0, 0.0,
         0.0, 0.0, 0.0, 0.0,
         0.0, 0.0, 1/3, 0.0,
         0.0, 1/3, 0.0, 1/3],  # 14
        [0.0, 0.0, 0.0, 0.0,
         0.0, 0.0, 0.0, 0.0,
         0.0, 0.0, 0.0, 0.0,
         0.0, 0.0, 0.0, 0.0],  # 15, terminal state, no transitions
    ]
)
@ -0,0 +1,223 @@
import numpy as np
from enum import Enum


# states
class States(Enum):
    Start = 0
    Safe1 = 1
    Hole2 = 2
    Safe3 = 3
    Safe4 = 4
    Safe5 = 5
    Safe6 = 6
    Safe7 = 7
    Hole8 = 8
    Safe9 = 9
    Hole10 = 10
    Safe11 = 11
    Safe12 = 12
    Safe13 = 13
    Safe14 = 14
    Goal15 = 15


# actions: for the 4x4 grid there are 48 directed moves between adjacent cells
# (minus those that would leave a terminal state)
class Actions(Enum):
    a0001 = 0x0001
    a0102 = 0x0102
    a0203 = 0x0203
    a0100 = 0x0100
    a0201 = 0x0201
    a0302 = 0x0302

    a0004 = 0x0004
    a0400 = 0x0400
    a0105 = 0x0105
    a0501 = 0x0501
    a0206 = 0x0206
    a0602 = 0x0602
    a0307 = 0x0307
    a0703 = 0x0703

    a0405 = 0x0405
    a0506 = 0x0506
    a0607 = 0x0607
    a0504 = 0x0504
    a0605 = 0x0605
    a0706 = 0x0706

    a0408 = 0x0408
    a0804 = 0x0804
    a0509 = 0x0509
    a0905 = 0x0905
    a0610 = 0x0610
    a1006 = 0x1006
    a0711 = 0x0711
    a1107 = 0x1107

    a0809 = 0x0809
    a0910 = 0x0910
    a1011 = 0x1011
    a1110 = 0x1110
    a1009 = 0x1009
    a0908 = 0x0908

    a0812 = 0x0812
    a1208 = 0x1208
    a0913 = 0x0913
    a1309 = 0x1309
    a1014 = 0x1014
    a1410 = 0x1410
    a1115 = 0x1115
    a1511 = 0x1511

    a1213 = 0x1213
    a1314 = 0x1314
    a1415 = 0x1415

    a1312 = 0x1312
    a1413 = 0x1413
    a1514 = 0x1514


# when taking a "forward" move,
# the probability of reaching the intended cell is 0.7,
# the probability of slipping to the left is 0.2,
# the probability of slipping to the right is 0.1;
# at edges and corners the forward probability is unchanged,
# and sliding off the grid leaves the agent where it is
Front = 0.7
Left = 0.2
Right = 0.1
# reward constants
Hole = -1
Goal = 5

# index semantics of each action record
Action = 0
ActionPi = 1
Reward = 2
StateProbs = 3

P = [
    [  # state 0: action, pi, reward, [state, prob]
        [0x0001, 1/2, 0, [[1, Front], [0, Left], [4, Right]]],
        [0x0004, 1/2, 0, [[4, Front], [1, Left], [0, Right]]]
    ],
    [  # state 1: action, pi, reward, [state, prob]
        [0x0100, 1/3, 0, [[0, Front], [5, Left], [1, Right]]],
        [0x0102, 1/3, Hole, [[2, Front], [1, Left], [5, Right]]],
        [0x0105, 1/3, 0, [[5, Front], [2, Left], [0, Right]]]
    ],
    [  # state 2: action, pi, reward, [state, prob]
        #[0x0201, 1/3, 0, [[1, Front],[6, Left],[2, Right]]],
        #[0x0203, 1/3, 0, [[3, Front],[2, Left],[6, Right]]],
        #[0x0206, 1/3, 0, [[6, Front],[3, Left],[1, Right]]]
        [0x0202, 1, Hole, [[2, 1]]]
    ],
    [  # state 3: action, pi, reward, [state, prob]
        [0x0302, 1/2, Hole, [[2, Front], [7, Left], [3, Right]]],
        [0x0307, 1/2, 0, [[7, Front], [3, Left], [2, Right]]]
    ],
    #############
    [  # state 4: action, pi, reward, [state, prob]
        [0x0400, 1/3, 0, [[0, Front], [4, Left], [5, Right]]],
        [0x0405, 1/3, 0, [[5, Front], [0, Left], [8, Right]]],
        [0x0408, 1/3, Hole, [[8, Front], [5, Left], [4, Right]]]
    ],
    [  # state 5: action, pi, reward, [state, prob]
        [0x0501, 1/4, 0, [[1, Front], [4, Left], [6, Right]]],
        [0x0504, 1/4, 0, [[4, Front], [9, Left], [1, Right]]],
        [0x0506, 1/4, 0, [[6, Front], [1, Left], [9, Right]]],
        [0x0509, 1/4, 0, [[9, Front], [6, Left], [4, Right]]]
    ],
    [  # state 6: action, pi, reward, [state, prob]
        [0x0602, 1/4, Hole, [[2, Front], [5, Left], [7, Right]]],
        [0x0605, 1/4, 0, [[5, Front], [10, Left], [2, Right]]],
        [0x0607, 1/4, 0, [[7, Front], [2, Left], [10, Right]]],
        [0x0610, 1/4, Hole, [[10, Front], [5, Left], [7, Right]]],
    ],
    [  # state 7: action, pi, reward, [state, prob]
        [0x0703, 1/3, 0, [[3, Front], [6, Left], [7, Right]]],
        [0x0706, 1/3, 0, [[6, Front], [11, Left], [3, Right]]],
        [0x0711, 1/3, 0, [[11, Front], [7, Left], [6, Right]]]
    ],
    ################
    [  # state 8: action, pi, reward, [state, prob]
        #[0x0804, 1/3, 0, [[4, Front],[8, Left],[9, Right]]],
        #[0x0809, 1/3, 0, [[9, Front],[4, Left],[12, Right]]],
        #[0x0812, 1/3, 0, [[12, Front],[9, Left],[8, Right]]]
        [0x0808, 1, Hole, [[8, 1]]]
    ],
    [  # state 9: action, pi, reward, [state, prob]
        [0x0905, 1/4, 0, [[5, Front], [8, Left], [10, Right]]],
        [0x0908, 1/4, Hole, [[8, Front], [13, Left], [5, Right]]],
        [0x0910, 1/4, Hole, [[10, Front], [5, Left], [13, Right]]],
        [0x0913, 1/4, 0, [[13, Front], [10, Left], [8, Right]]]
    ],
    [  # state 10: action, pi, reward, [state, prob]
        #[0x1006, 1/4, 0, [[6, Front],[9, Left],[11, Right]]],
        #[0x1011, 1/4, 0, [[11, Front],[6, Left],[14, Right]]],
        #[0x1014, 1/4, 0, [[14, Front],[11, Left],[9, Right]]],
        #[0x1009, 1/4, 0, [[9, Front],[14, Left],[6, Right]]]
        [0x1010, 1, Hole, [[10, 1]]]
    ],
    [  # state 11: action, pi, reward, [state, prob]
        [0x1107, 1/3, 0, [[7, Front], [10, Left], [11, Right]]],
        [0x1110, 1/3, Hole, [[10, Front], [15, Left], [7, Right]]],
        [0x1115, 1/3, 0, [[15, Front], [15, Left], [10, Right]]]
    ],
    ###########
    [  # state 12: action, pi, reward, [state, prob]
        [0x1208, 1/2, Hole, [[8, Front], [12, Left], [13, Right]]],
        [0x1213, 1/2, 0, [[13, Front], [8, Left], [12, Right]]]
    ],
    [  # state 13: action, pi, reward, [state, prob]
        [0x1309, 1/3, 0, [[9, Front], [12, Left], [14, Right]]],
        [0x1312, 1/3, 0, [[12, Front], [13, Left], [9, Right]]],
        [0x1314, 1/3, 0, [[14, Front], [9, Left], [13, Right]]]
    ],
    [  # state 14: action, pi, reward, [state, prob]
        [0x1410, 1/3, Hole, [[10, Front], [13, Left], [15, Right]]],
        [0x1413, 1/3, 0, [[13, Front], [14, Left], [10, Right]]],
        [0x1415, 1/3, Goal, [[15, Front], [10, Left], [14, Right]]]
        #[0x1414, 1, Goal, [[14, 1]]]
    ],
    [  # state 15: action, pi, reward, [state, prob]
        #[0x1511, 1/2, 0, [[15, Front],[14, Left], [15, Right]]],
        #[0x1514, 1/2, 0, [[14, Front],[15, Left],[11, Right]]]
        [0x1515, 1, Goal, [[15, 1]]]
    ]

]


class DataParser(object):
    def get_next_actions(self, curr_state):
        actions_data = P[curr_state]
        return actions_data

    def get_action_pi_reward(self, action_data):
        return action_data[Action], action_data[ActionPi], action_data[Reward]

    def get_action_states_probs(self, action_data):
        return action_data[StateProbs]

    def get_next_states_probs(self, action):
        for state in P:
            for actions_data in state:
                if actions_data[Action] == action:
                    return actions_data[Reward], actions_data[StateProbs]
        return None, None


'''
dataParser = DataParser()
data = dataParser.get_next_actions(0)
print(len(data))
for i in range(len(data)):
    a, p, r = dataParser.get_action_pi_reward(data[i])
    print(a, p, r)
    sp = dataParser.get_action_states_probs(data[i])
    print(sp)
'''
@ -0,0 +1,30 @@
import numpy as np
from enum import Enum


# states
class States(Enum):
    Class1 = 0
    Class2 = 1
    Class3 = 2
    Pass = 3
    Pub = 4
    Play = 5
    Sleep = 6

# reward vector
# [Class1, Class2, Class3, Pass, Pub, Play, Sleep]
Rewards = [-2, -2, -2, 10, 1, -1, 0]

# state-to-state transition matrix
Matrix = np.array(
    [  #Cl1  Cl2  Cl3  Pas  Pub  Ply  Slp
        [0.0, 0.5, 0.0, 0.0, 0.0, 0.5, 0.0],  # Class1
        [0.0, 0.0, 0.8, 0.0, 0.0, 0.0, 0.2],  # Class2
        [0.0, 0.0, 0.0, 0.6, 0.4, 0.0, 0.0],  # Class3
        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],  # Pass
        [0.2, 0.4, 0.4, 0.0, 0.0, 0.0, 0.0],  # Pub
        [0.1, 0.0, 0.0, 0.0, 0.0, 0.9, 0.0],  # Play
        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]   # Sleep
    ]
)
@ -0,0 +1,59 @@
from enum import Enum
import numpy as np


# states
class States(Enum):
    Rest = 0
    Game = 1
    Class1 = 2
    Class2 = 3
    Class3 = 4


# actions
class Actions(Enum):
    Quit = 0
    Play1 = 1
    Play2 = 2
    Study1 = 3
    Study2 = 4
    Pass = 5
    Pub = 6
    Sleep = 7


# per-action rewards
Rewards = [0, -1, -1, -2, -2, 10, 1, 0]

# state -> action probabilities, pi(a|s)
Pi_sa = np.array([
    # S_Rest -> A_none
    [0, 0, 0, 0, 0, 0, 0, 0],
    # S_Game -> A_Quit, A_Play1
    [0.5, 0.5, 0, 0, 0, 0, 0, 0],
    # S_Class1 -> A_Play2, A_Study1
    [0, 0, 0.5, 0.5, 0, 0, 0, 0],
    # S_Class2 -> A_Study2, A_Sleep
    [0, 0, 0, 0, 0.5, 0, 0, 0.5],
    # S_Class3 -> A_Pass, A_Pub
    [0, 0, 0, 0, 0, 0.5, 0.5, 0]
])

# action -> state transition probabilities, P(s'|s,a)
P_as = np.array([
    # A_Quit -> S_Class1
    [0, 0, 1, 0, 0],
    # A_Play1 -> S_Game
    [0, 1, 0, 0, 0],
    # A_Play2 -> S_Game
    [0, 1, 0, 0, 0],
    # A_Study1 -> S_Class2
    [0, 0, 0, 1, 0],
    # A_Study2 -> S_Class3
    [0, 0, 0, 0, 1],
    # A_Pass -> S_Rest
    [1, 0, 0, 0, 0],
    # A_Pub -> S_Class1, S_Class2, S_Class3
    [0, 0, 0.2, 0.4, 0.4],
    # A_Sleep -> S_None
    [0, 0, 0, 0, 0]
])
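Because the policy is stored as a state-to-action matrix and the dynamics as an action-to-state matrix, the Markov chain induced by the policy is just their product, and the expected per-state reward is `Pi_sa @ Rewards`. A minimal sketch of that composition as a usage note (the names `P_pi` and `R_pi` are mine, not part of the PR's code):

```python
import numpy as np
import Data_Students2 as ds2

# P_pi[s, s'] = sum_a pi(a|s) * P(s'|s,a);  R_pi[s] = sum_a pi(a|s) * R(a)
P_pi = ds2.Pi_sa @ ds2.P_as
R_pi = ds2.Pi_sa @ np.array(ds2.Rewards)

# with these two arrays, the MRP machinery (e.g. the Bellman iteration) applies unchanged
print(np.round(P_pi, 2))
print(np.round(R_pi, 2))
```
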
@ -0,0 +1,175 @@
import numpy as np
from enum import Enum


# states
class States(Enum):
    Goal0 = 0
    Safe1 = 1
    Hole2 = 2
    Safe3 = 3
    Safe4 = 4
    Safe5 = 5


# actions: directed moves between cells of the small grid
# (moves out of terminal cells are omitted)
class Actions(Enum):
    a0001 = 0x0001
    a0102 = 0x0102
    a0100 = 0x0100
    a0201 = 0x0201

    a0004 = 0x0004
    a0400 = 0x0400
    a0105 = 0x0105
    a0501 = 0x0501
    a0206 = 0x0206
    a0602 = 0x0602

    a0405 = 0x0405
    a0506 = 0x0506
    a0504 = 0x0504
    a0605 = 0x0605


# when taking a "forward" move,
# the probability of reaching the intended cell is 0.7,
# the probability of slipping to the left is 0.2,
# the probability of slipping to the right is 0.1;
# at edges and corners the forward probability is unchanged,
# and sliding off the grid leaves the agent where it is
Front = 0.7
Left = 0.2
Right = 0.1
# reward constants
Hole = -1
Goal = 5

# index semantics of each action record
Action = 0
ActionPi = 1
Reward = 2
StateProbs = 3

P = [
    [  # state 0: action, pi, reward, [state, prob]
        #[0x0000, 1, Goal, [[0, 1]]],
    ],
    [  # state 1: action, pi, reward, [state, prob]
        [0x0100, 1/3, 0, [[0, Front], [4, Left], [1, Right]]],
        [0x0102, 1/3, Hole, [[2, Front], [1, Left], [5, Right]]],
        [0x0104, 1/3, 0, [[4, Front], [2, Left], [0, Right]]]
    ],
    [  # state 2: action, pi, reward, [state, prob]
        #[0x0201, 1/3, 0, [[1, Front],[6, Left],[2, Right]]],
        #[0x0203, 1/3, 0, [[3, Front],[2, Left],[6, Right]]],
        #[0x0206, 1/3, 0, [[6, Front],[3, Left],[1, Right]]]
        #[0x0202, 1, Hole, [[2, 1]]]
    ],

    #############
    [  # state 3: action, pi, reward, [state, prob]
        [0x0300, 1/2, 0, [[0, Front], [3, Left], [4, Right]]],
        [0x0304, 1/2, 0, [[4, Front], [0, Left], [3, Right]]],
    ],
    [  # state 4: action, pi, reward, [state, prob]
        [0x0401, 1/3, 0, [[1, Front], [3, Left], [5, Right]]],
        [0x0403, 1/3, 0, [[3, Front], [4, Left], [1, Right]]],
        [0x0405, 1/3, 0, [[5, Front], [1, Left], [4, Right]]],
    ],
    [  # state 5: action, pi, reward, [state, prob]
        [0x0502, 1/2, Hole, [[2, Front], [4, Left], [5, Right]]],
        [0x0504, 1/2, 0, [[4, Front], [5, Left], [2, Right]]],
    ],

]


class DataParser(object):
    def get_next_actions(self, curr_state):
        actions_data = P[curr_state.value]
        #print(actions_data)
        return actions_data

    def get_action_pi_reward(self, action_data):
        return action_data[Action], action_data[ActionPi], action_data[Reward]

    def get_action_states_probs(self, action_data):
        return action_data[StateProbs]

    def get_next_states_probs(self, action):
        for state in P:
            for actions_data in state:
                if actions_data[Action] == action:
                    return actions_data[Reward], actions_data[StateProbs]
        return None, None


def V_pi(States, dataParser, gamma):
    num_state = 6
    V_curr = [0.0] * num_state
    V_next = [0.0] * num_state
    count = 0
    # iterate until convergence
    while True:
        # sweep over every state s
        for curr_state in States:
            v_curr_sum = 0
            # actions available from the current state, with pi(a|s)
            actions_data = dataParser.get_next_actions(curr_state)
            # loop over every action allowed by the policy
            for action_data in actions_data:
                next_action_value, next_action_prob, reward = dataParser.get_action_pi_reward(action_data)
                # transition probabilities P(s'|s,a) for this action
                next_states_probs = dataParser.get_action_states_probs(action_data)
                v_sum = 0
                # loop over every successor state
                for [next_state_value, next_state_prob] in next_states_probs:
                    # math: \sum_{s'} P_{ss'}^a v_{\pi}(s')
                    v_sum += next_state_prob * V_next[next_state_value]
                # end for
                # math: \sum_a \pi(a|s) [R_s^a + \gamma \sum_{s'} P_{ss'}^a v_{\pi}(s')]
                v_curr_sum += next_action_prob * (reward + gamma * v_sum)
            # end for
            V_curr[curr_state.value] = v_curr_sum
        # end for
        # check convergence
        if np.allclose(V_next, V_curr):
            break
        # copy V_curr into V_next for the next sweep
        V_next = V_curr.copy()
        count += 1
    # end while
    print(count)
    return V_next


def Q2_pi(Actions, dataParser, gamma, vs):
    Q = {}
    # loop over every action
    for curr_action in Actions:
        q_sum = 0
        # reward and transition probabilities P(s'|s,a) for this action
        reward, next_states_probs = dataParser.get_next_states_probs(curr_action.value)
        if reward is None:
            continue
        # sum over every successor state
        for [next_state_value, next_state_prob] in next_states_probs:
            # math: \sum_{s'} P_{ss'}^a v_{\pi}(s')
            q_sum += next_state_prob * vs[next_state_value]
        # end for
        # math: q_{\pi}(s,a) = R_s^a + \gamma \sum_{s' \in S} P_{ss'}^a v_{\pi}(s')
        q = reward + gamma * q_sum
        Q[curr_action.name] = q
    # end for
    return Q


if __name__ == "__main__":
    gamma = 0.9
    dataParser = DataParser()
    vs = V_pi(States, dataParser, gamma)
    print(np.round(np.array(vs).reshape(2, 3), 2))
    Q = Q2_pi(Actions, dataParser, gamma, vs)
    for q in Q:
        print(q, "={:.4f}".format(Q[q]))
@ -0,0 +1,142 @@
import Data_FrozenLake2 as dfl2
import numpy as np


def V_star(States, dataParser, gamma):
    num_state = len(States)
    V_curr = [0.0] * num_state
    V_next = [0.0] * num_state
    count = 0
    # iterate until convergence
    while True:
        # sweep over every state s
        for curr_state in States:
            list_v = []
            # actions available from the current state, with pi(a|s)
            next_actions_datas = dataParser.get_next_actions(curr_state.value)
            for next_action_data in next_actions_datas:
                next_action_value, next_action_prob, reward = dataParser.get_action_pi_reward(next_action_data)

                # transition probabilities P(s'|s,a) for this action
                next_states_probs = dataParser.get_action_states_probs(next_action_data)
                v_sum = 0
                # loop over every successor state
                for [next_state_value, next_state_prob] in next_states_probs:
                    # math: \sum_{s'} P_{ss'}^a v_{*}(s')
                    v_sum += next_state_prob * V_next[next_state_value]
                # end for
                # math: \max_a [R_s^a + \gamma \sum_{s'} P_{ss'}^a v_{*}(s')]
                list_v.append(reward + gamma * v_sum)
            # end for
            if len(list_v) > 0:
                V_curr[curr_state.value] = max(list_v)
        # end for
        # check convergence
        if np.allclose(V_next, V_curr):
            break
        # copy V_curr into V_next for the next sweep
        V_next = V_curr.copy()
        count += 1
    # end while
    print(count)
    return V_next


def Q_star(Actions, dataParser, gamma):
    # Q values are keyed by the hex action code, since the codes are not
    # consecutive small integers
    Q_curr = {a.value: 0.0 for a in Actions}
    Q_next = {a.value: 0.0 for a in Actions}
    count = 0
    # iterate (bounded, in case of slow convergence)
    while count < 100:
        # sweep over every action
        for curr_action in Actions:
            q_curr_sum = 0
            # reward and transition probabilities P(s'|s,a) for this action
            reward, next_states_probs = dataParser.get_next_states_probs(curr_action.value)
            if reward is None:
                continue
            # sum over every successor state
            for [next_state_value, next_state_prob] in next_states_probs:
                # actions available from the successor state
                actions_datas = dataParser.get_next_actions(next_state_value)
                list_q = []
                # take the max over next-action values
                for action_data in actions_datas:
                    action, _, _ = dataParser.get_action_pi_reward(action_data)
                    list_q.append(Q_next.get(action, 0.0))
                # end for
                # math: \sum_{s'} P_{ss'}^a \max_{a'} q_{*}(s',a')
                if len(list_q) > 0:
                    q_curr_sum += next_state_prob * max(list_q)
            # end for
            # math: R_s^a + \gamma ( \sum_{s'} P_{ss'}^a \max_{a'} q_{*}(s',a') )
            Q_curr[curr_action.value] = reward + gamma * q_curr_sum
        # end for
        # check convergence
        if np.allclose(list(Q_next.values()), list(Q_curr.values())):
            break
        # copy Q_curr into Q_next for the next sweep
        Q_next = Q_curr.copy()
        count += 1
    # end while
    print(count)
    return Q_next


def Q_star_from_V_star(Actions, dataParser, gamma, v_star):
    Q_star = {}
    # loop over every action
    for curr_action in Actions:
        q_sum = 0
        # reward and transition probabilities P(s'|s,a) for this action
        reward, next_states_probs = dataParser.get_next_states_probs(curr_action.value)
        if reward is None:
            continue
        # sum over every successor state
        for [next_state_value, next_state_prob] in next_states_probs:
            # math: \sum_{s'} P_{ss'}^a v_{*}(s')
            q_sum += next_state_prob * v_star[next_state_value]
        # end for
        # math: R_s^a + \gamma \sum_{s'} P_{ss'}^a v_{*}(s')
        Q_star[curr_action.name] = reward + gamma * q_sum
    # end for
    return sorted(Q_star.items())


def find_next_best(Q, start):
    action = None
    value = None
    for q in Q:
        if q[0].startswith(start):
            if action is None:
                action = q[0]
                value = q[1]
            else:
                if q[1] > value:
                    action = q[0]
                    value = q[1]
    return action, value


if __name__ == "__main__":
    gamma = 0.9
    dataParser = dfl2.DataParser()
    vs = V_star(dfl2.States, dataParser, gamma)
    print(np.round(np.array(vs).reshape(4, 4), 2))

    Q_star = Q_star_from_V_star(dfl2.Actions, dataParser, gamma, vs)
    for q in Q_star:
        print(q)

    # greedily follow the best action codes from the start cell
    start = "a00"
    count = 0
    while True:
        action, value = find_next_best(Q_star, start)
        print(action, value)
        if action is None:
            break
        start = "a" + action.replace(start, "")
        count += 1
        if count > 8:
            break
@ -0,0 +1,73 @@
import Data_FrozenLake2 as dfl2

import numpy as np


def V_pi(States, dataParser, gamma):
    num_state = 16
    V_curr = [0.0] * num_state
    V_next = [0.0] * num_state
    count = 0
    # iterate until convergence
    while True:
        # sweep over every state s
        for curr_state in States:
            v_curr_sum = 0
            # actions available from the current state, with pi(a|s)
            actions_data = dataParser.get_next_actions(curr_state.value)
            # loop over every action allowed by the policy
            for action_data in actions_data:
                next_action_value, next_action_prob, reward = dataParser.get_action_pi_reward(action_data)
                # transition probabilities P(s'|s,a) for this action
                next_states_probs = dataParser.get_action_states_probs(action_data)
                v_sum = 0
                # loop over every successor state
                for [next_state_value, next_state_prob] in next_states_probs:
                    # math: \sum_{s'} P_{ss'}^a v_{\pi}(s')
                    v_sum += next_state_prob * V_next[next_state_value]
                # end for
                # math: \sum_a \pi(a|s) [R_s^a + \gamma \sum_{s'} P_{ss'}^a v_{\pi}(s')]
                v_curr_sum += next_action_prob * (reward + gamma * v_sum)
            # end for
            V_curr[curr_state.value] = v_curr_sum
        # end for
        # check convergence
        if np.allclose(V_next, V_curr):
            break
        # copy V_curr into V_next for the next sweep
        V_next = V_curr.copy()
        count += 1
    # end while
    print(count)
    return V_next


def Q2_pi(Actions, dataParser, gamma, vs):
    Q = {}
    # loop over every action
    for curr_action in Actions:
        q_sum = 0
        # reward and transition probabilities P(s'|s,a) for this action
        reward, next_states_probs = dataParser.get_next_states_probs(curr_action.value)
        if reward is None:
            continue
        # sum over every successor state
        for [next_state_value, next_state_prob] in next_states_probs:
            # math: \sum_{s'} P_{ss'}^a v_{\pi}(s')
            q_sum += next_state_prob * vs[next_state_value]
        # end for
        # math: q_{\pi}(s,a) = R_s^a + \gamma \sum_{s' \in S} P_{ss'}^a v_{\pi}(s')
        q = reward + gamma * q_sum
        Q[curr_action.name] = q
    # end for
    return Q


if __name__ == "__main__":
    gamma = 0.9
    dataParser = dfl2.DataParser()
    vs = V_pi(dfl2.States, dataParser, gamma)
    print(np.round(np.array(vs).reshape(4, 4), 2))
    Q = Q2_pi(dfl2.Actions, dataParser, gamma, vs)
    for q in Q:
        print(q, "={:.4f}".format(Q[q]))
@ -0,0 +1,24 @@
import Algorithm_MDP_Star as algoMS
import Data_Students2 as ds2


def Student_V_star(gamma):
    v = algoMS.V_star(ds2.States, ds2.Pi_sa, ds2.P_as, ds2.Rewards, gamma)
    for start_state in ds2.States:
        print(start_state, "= {:.1f}".format(v[start_state.value]))


def Student_Q_star(gamma):
    v = algoMS.Q_star(ds2.Actions, ds2.Pi_sa, ds2.P_as, ds2.Rewards, gamma)
    for action in ds2.Actions:
        print(action, "= {:.1f}".format(v[action.value]))


def Student_Q_from_V_star(gamma):
    v_star = algoMS.V_star(ds2.States, ds2.Pi_sa, ds2.P_as, ds2.Rewards, gamma)
    q_star = algoMS.Q_star_from_V_star(ds2.Actions, ds2.P_as, ds2.Rewards, gamma, v_star)
    for action in ds2.Actions:
        print(action, "= {:.1f}".format(q_star[action.value]))


if __name__ == "__main__":
    gamma = 1
    Student_V_star(gamma)
    Student_Q_star(gamma)
    Student_Q_from_V_star(gamma)
@ -0,0 +1,25 @@
import Data_Students2 as ds2
import Algorithm_MDP as mba


def Student_V_Pi(gamma):
    v_pi = mba.V_pi(ds2.States, ds2.Pi_sa, ds2.P_as, ds2.Rewards, gamma)
    for state in ds2.States:
        print(state, "= {:.1f}".format(v_pi[state.value]))
    return v_pi


def Student_Q_Pi(gamma):
    q_pi = mba.Q_pi(ds2.Actions, ds2.Pi_sa, ds2.P_as, ds2.Rewards, gamma)
    for action in ds2.Actions:
        print(action, "= {:.1f}".format(q_pi[action.value]))


def Student_Q_Pi_From_V_Pi(gamma):
    v_pi = mba.V_pi(ds2.States, ds2.Pi_sa, ds2.P_as, ds2.Rewards, gamma)
    q_pi = mba.Q_pi_from_V_pi(ds2.Actions, ds2.P_as, ds2.Rewards, gamma, v_pi)
    for action in ds2.Actions:
        print(action, "= {:.1f}".format(q_pi[action.value]))


if __name__ == "__main__":
    gamma = 1
    Student_V_Pi(gamma)
    Student_Q_Pi(gamma)
    Student_Q_Pi_From_V_Pi(gamma)
@ -0,0 +1,27 @@
import numpy as np
import Algorithm_MPR as algoM
import Data_FrozenLake as dfl


def FrozenLake_MentoCarol(gamma):
    episodes = 20000
    end_states = [dfl.States.Hole2, dfl.States.Hole8, dfl.States.Hole10, dfl.States.Goal15]
    vs = algoM.MonteCarol(dfl.Rewards, dfl.Matrix, dfl.States, end_states, gamma, episodes)
    print(np.round(np.array(vs).reshape(4, 4), 2))


def FrozenLake_Matrix(gamma):
    vs = algoM.Matrix(dfl, gamma)
    print(np.round(np.array(vs).reshape(4, 4), 2))


def FrozenLake_Bellman(gamma):
    vs = algoM.Bellman(dfl.States, dfl.Matrix, dfl.Rewards, gamma)
    np.set_printoptions(suppress=True)
    print(np.round(np.array(vs).reshape(4, 4), 2))


if __name__ == "__main__":
    gamma = 1
    print(gamma)
    #FrozenLake_MentoCarol(gamma)
    FrozenLake_Matrix(gamma)
    FrozenLake_Bellman(gamma)
@ -0,0 +1,30 @@
import Data_Student as ds
import Algorithm_MPR as algoM
import numpy as np


def Student_MonteCarol(gamma):
    episodes = 10000
    end_states = [ds.States.Sleep]
    v = algoM.MonteCarol(ds.Rewards, ds.Matrix, ds.States, end_states, gamma, episodes)
    for start_state in ds.States:
        print(start_state, "= {:.2f}".format(v[start_state.value]))


def InvMatrix(gamma):
    v = algoM.Matrix(ds, gamma)
    for start_state in ds.States:
        print(start_state, "= {:.2f}".format(v[start_state.value]))
    return v


def Bellman(gamma):
    v = algoM.Bellman(ds.States, ds.Matrix, ds.Rewards, gamma)
    for start_state in ds.States:
        print(start_state, "= {:.2f}".format(v[start_state.value]))


if __name__ == "__main__":
    gamma = 0.9
    #Student_MonteCarol(gamma)
    InvMatrix(gamma)
    Bellman(gamma)