* update

* Update MRP-1.py

* up

* uo

* uui

* up

* update

* oo

* ui

* ui

* Update MDP_FrozenLake_Optimal.py
This commit is contained in:
xiaowuhu 2022-02-07 13:46:19 +08:00 committed by GitHub
Parent 9dc715159d
Commit 0c91801d6e
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
56 changed files: 1513 additions and 0 deletions

[23 binary image files changed; sizes range from 14 KiB to 369 KiB]

View file

@@ -0,0 +1,299 @@
$$
p(s'|s,a) = \Pr \{S_t=s'|S_{t-1}=s,A_{t-1}=a\}
$$
$$
\sum_{i=1}^n p(s_i'|s_j,a) = 1
$$
$$
p(s_1'|s_j,a) + p(s_2'|s_j,a) + p(s_3'|s_j,a) = 1
$$
$$
P =
\begin{bmatrix}
p(s_1|s_1) & p(s_2|s_1) & \cdots & p(s_n|s_1)
\\
p(s_1|s_2) & p(s_2|s_2) & \cdots & p(s_n|s_2)
\\
\vdots & \vdots & \ddots & \vdots
\\
p(s_1|s_n) & p(s_2|s_n) & \cdots & p(s_n|s_n)
\end{bmatrix}
$$
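A minimal NumPy sketch of such a transition matrix (the three-state values are made up for illustration), checking that each row sums to 1 under the row-stochastic convention used above:

```python
import numpy as np

# made-up 3-state transition matrix; entry [i, j] = p(s_j | s_i)
P = np.array([[0.1, 0.6, 0.3],
              [0.0, 0.5, 0.5],
              [0.0, 0.0, 1.0]])
assert np.allclose(P.sum(axis=1), 1.0)  # each row is a probability distribution
```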
Reward function
$$
R(s,a)=\mathbb {E} \ [R_{t} \ | \ S_{t-1}=s,A_{t-1}=a ]
$$
$$
R(s)=\mathbb {E} \ [R_{t} \ | \ S_{t-1}=s ]
$$
$$
\begin{aligned}
G_t &= R_{t+1} + \gamma R_{t+2} + \gamma^2 R_{t+3} + \cdots + \gamma^{T-t-1} R_{T}
\\
&= \sum_{k=0}^{T-t-1} \gamma^k R_{t+k+1}, \quad 0 \le \gamma \le 1
\end{aligned}
$$
$$
\begin{aligned}
G_t &= R_{t+1} + \gamma R_{t+2} + \gamma^2 R_{t+3} + \gamma^3 R_{t+4} + \cdots
\\
&= R_{t+1} + \gamma (R_{t+2} + \gamma R_{t+3} + \gamma^{2} R_{t+4}+\cdots)
\\
&=R_{t+1} + \gamma G_{t+1}
\end{aligned}
$$
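As a concrete illustration of the two equivalent forms above, a minimal sketch (the reward sequence is made up) that computes the same return directly and via the backward recursion $G_t = R_{t+1} + \gamma G_{t+1}$:

```python
gamma = 0.9
rewards = [-2, -2, -2, 10]  # R_{t+1}, R_{t+2}, ... (made-up values)

# direct form: G_t = sum_k gamma^k * R_{t+k+1}
G_direct = sum(gamma**k * r for k, r in enumerate(rewards))

# recursive form, computed backwards from the end of the episode
G = 0.0
for r in reversed(rewards):
    G = r + gamma * G

print(round(G_direct, 4), round(G, 4))  # both print 1.87
```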
$$
R_s = \mathbb{E} [R_{t+1} | S_t=s]
$$
$$
\begin{aligned}
V(s) &= \mathbb{E} [G_t \ | \ S_t=s]
\\
&=\mathbb{E} [R_{t+1} + \gamma R_{t+2} + \gamma^2 R_{t+3} + \cdots \ | \ S_t=s]
\\
&=\mathbb{E} [R_{t+1} + \gamma G_{t+1} \ | \ S_t=s]
\\
&=\mathbb{E} [R_{t+1} \ | \ S_t=s] + \gamma \mathbb{E} [G_{t+1} \ | \ S_t=s]
\\
&=R_s + \gamma \mathbb{E} [V(S_{t+1}) \ | \ S_t=s]
\end{aligned}
$$
$$
V(Class3) = 4.09
\\
\begin{aligned}
X&=R_{Class3}+\gamma \big[V(Pub) \cdot P(S_{Pub}|S_{Class3})
\\
&\quad + V(Pass) \cdot P(S_{Pass}|S_{Class3}) \big]
\\
&=(-2)+0.9 \times (1.93 \times 0.4+10 \times 0.6)=4.09
\end{aligned}
\\
V(Class3) = X
$$
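A quick arithmetic check of the example (a minimal sketch; 1.93 and 10 are the values of $V(Pub)$ and $V(Pass)$ used above, with $\gamma=0.9$):

```python
# numeric check of the V(Class3) example
X = -2 + 0.9 * (1.93 * 0.4 + 10 * 0.6)
print(round(X, 2))  # 4.09
```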
$$
V(s) = R_s + \gamma \sum_{s' \in S} P(s'|s) V(s')
$$
$$
V(s)=R_s + \gamma \sum_{s' \in S} P_{ss'} \cdot V(s')
$$
$$
V(s)=R_s + \gamma * [p_1V(s'_1) + p_2V(s'_2) + p_3V(s'_3)]
$$
Matrix form
$$
V = R + \gamma PV
$$
$$
\begin{bmatrix}
V(1)
\\
V(2)
\\
\vdots
\\
V(n)
\end{bmatrix}
=
\begin{bmatrix}
R_1
\\
R_2
\\
\vdots
\\
R_n
\end{bmatrix}
+\gamma
\begin{bmatrix}
P_{11} & P_{12} & \cdots & P_{1n}
\\
P_{21} & P_{22} & \cdots & P_{2n}
\\
\vdots & \vdots & \ddots & \vdots
\\
P_{n1} & P_{n2} & \cdots & P_{nn}
\end{bmatrix}
\begin{bmatrix}
V(1)
\\
V(2)
\\
\vdots
\\
V(n)
\end{bmatrix}
$$
$$
V - \gamma PV = R
\\
(I-\gamma P)V = R
\\
V = (I - \gamma P)^{-1} R
$$
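This closed form can be evaluated directly with NumPy; the `Matrix` function defined later in this commit performs the same computation. A minimal standalone sketch (the 3-state $P$ and $R$ below are made-up illustration values):

```python
import numpy as np

P = np.array([[0.0, 0.8, 0.2],
              [0.0, 0.0, 1.0],
              [0.0, 0.0, 1.0]])   # row-stochastic transition matrix (made up)
R = np.array([-1.0, -2.0, 0.0])   # expected immediate reward per state (made up)
gamma = 0.9

V = np.linalg.inv(np.eye(3) - gamma * P) @ R
print(np.round(V, 3))
```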
State-value function under a policy
$$
v_{\pi}(s)=\mathbb {E}_{\pi} [ G_t |S_t=s]
$$
$$
\begin{aligned}
v_{\pi}(s)&=\sum_{a \in A} \pi(a|s) q_\pi(s,a)
\\
&=\pi(a_1|s) q_{\pi}(s,a_1)+\pi(a_2|s) q_{\pi}(s,a_2)+\pi(a_3|s) q_{\pi}(s,a_3)
\end{aligned}
$$
Action-value function under a policy
$$
q_{\pi}(s,a)=\mathbb E_{\pi} [G_t | S_t=s, A_t=a]
$$
$$
\begin{aligned}
q_{\pi}(s,a)&=R_s^a + \gamma \sum_{s' \in S} P^a_{ss'} v_{\pi}(s')
\\
&= R_s^a + \gamma [P_1 v_{\pi}(s'_1)+P_2 v_{\pi}(s'_2)]
\end{aligned}
$$
$$
v_{\pi}(s)=\sum_{a \in A} \pi(a|s)\Big[ R_s^a + \gamma \sum_{s' \in S} P^a_{ss'} v_{\pi}(s') \Big]
$$
$$
q_{\pi}(s,a)=R_s^a + \gamma \sum_{s' \in S} P^a_{ss'} \sum_{a' \in A} \pi(a'|s') q_\pi(s',a')
$$
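These expectation equations translate directly into iterative policy evaluation. A minimal sketch, assuming tabular NumPy arrays `pi_sa` (policy, shape S×A), `R_sa` (rewards, S×A) and `P_sas` (transitions, S×A×S); these names and this layout are assumptions for the sketch, not the data format of the code in this commit:

```python
import numpy as np

def policy_evaluation(pi_sa, R_sa, P_sas, gamma, tol=1e-6):
    """Iterate v_pi(s) = sum_a pi(a|s) [R_s^a + gamma * sum_s' P_ss'^a v_pi(s')]."""
    V = np.zeros(pi_sa.shape[0])
    while True:
        # P_sas @ V has shape (S, A): expected next-state value for each (s, a)
        V_new = np.sum(pi_sa * (R_sa + gamma * P_sas @ V), axis=1)
        if np.max(np.abs(V_new - V)) < tol:
            return V_new
        V = V_new
```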
$$
\begin{aligned}
V_1 &= \pi(A_{Play}|S_{V_1})*(R_{Play}+\gamma P_{11}V_1)+\pi(A_{Quit}|S_{V_1})*(R_{Quit}+\gamma P_{12}V_2)
\\
V_2 &= \pi(A_{Play}|S_{V_2})*(R_{Play}+\gamma P_{21}V_1)+\pi(A_{Study1}|S_{V_2})*(R_{Study1}+\gamma P_{23}V_3)
\\
V_3 &= \pi(A_{Sleep}|S_{V_3})*(R_{Sleep}+\gamma P_{30}V_0)+\pi(A_{Study2}|S_{V_3})*(R_{Study2}+\gamma P_{34}V_4)
\\
V_4 &= \pi(A_{Pass}|S_{V_4})*(R_{Pass}+\gamma P_{40}V_0)+\pi(A_{Pub}|S_{V_4})*(R_{Pub}+\gamma P_{42}V_2+\gamma P_{43}V_3+\gamma P_{44}V_4)
\end{aligned}
$$
According to the formula
$$
V_*(s) = \underset{a}{\max} [R_s^a + \gamma \sum_{s' \in S} P_{ss'}^aV_*(s')]
$$
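Replacing the policy-weighted sum with a maximum over actions turns the same backup into value iteration. A minimal sketch in the same assumed tabular layout as the policy-evaluation sketch above:

```python
import numpy as np

def value_iteration(R_sa, P_sas, gamma, tol=1e-6):
    """Iterate V_*(s) = max_a [R_s^a + gamma * sum_s' P_ss'^a V_*(s')]."""
    V = np.zeros(R_sa.shape[0])
    while True:
        V_new = np.max(R_sa + gamma * P_sas @ V, axis=1)
        if np.max(np.abs(V_new - V)) < tol:
            return V_new
        V = V_new
```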
$$
\gamma=1
\\
V_{Rest}=V0=0
\\
V_{Game} = V1 = \max (-1+V1, 0+V2)
\\
V_{Class1}=V2=\max (-1+V1, -2+V3)
\\
V_{Class2}=V3 = \max (0 + V0, -2+V4)
\\
V_{Class3} = V4 = \max (10+V0, \ 1+0.2V2+0.4V3+0.4V4)
$$
Now solve the above system of equations.
Since
$$
x = \max (x+a, b)
$$
where $a$ and $b$ are constants and $a<0$, it follows that $x=b$. Therefore
$$
V1 = \max (-1+V1, 0+V2)=V2
\\
V2=\max (-1+V1, -2+V3)=\max (-1+V2, -2+V3)=V3-2=V1
\\
V3 = V1+2
\\
V3 = \max (0 + V0, -2+V4)=V4-2
\\
V4 = V3+2=V1+4
\\
V4 = \max (10+V0, \ 1+0.2V2+0.4V3+0.4V4)
$$
Replace all the variables with $V1$:
$$
V4 = \max (10, \ 1+0.2V1+0.4(V1+2)+0.4(V1+4))=\max (10, \ V1+3.4)=\max(10, \ V4-0.6)
$$
Therefore
$$
V4=10
\\
V1=V4-4=6
\\
V2=V1=6
\\
V3=V1+2=8
$$
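A quick numeric check of this fixed point (a minimal sketch that plugs the solution back into the optimality equations above, with $\gamma=1$):

```python
# plug the optimal values back into the Bellman optimality equations (gamma = 1)
V0, V1, V2, V3, V4 = 0, 6, 6, 8, 10
assert V1 == max(-1 + V1, 0 + V2)
assert V2 == max(-1 + V1, -2 + V3)
assert V3 == max(0 + V0, -2 + V4)
assert V4 == max(10 + V0, 1 + 0.2 * V2 + 0.4 * V3 + 0.4 * V4)
```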
According to
$$
v_{\pi}(s)=\sum_{a \in A} \pi(a|s)\Big[ R_s^a + \gamma \sum_{s' \in S} P^a_{ss'} v_{\pi}(s') \Big]
$$
$$
\gamma=1
$$
$$
V0=0
\\
V1=0.5(-1+1*1*V1)+0.5(0+1*1*V2)=0.5V1+0.5V2-0.5
\\
V2=0.5(-1+1*1*V1)+0.5(-2+1*1*V3)=0.5V1+0.5V3-1.5
\\
V3=0.5(0+1*1*0)+0.5(-2+1*1*V4)=0.5V4-1
\\
V4=0.5(10+1*1*0)+0.5(1+1*0.2*V2+1*0.4*V3+1*0.4*V4)=0.1V2+0.2V3+0.2V4+5.5
$$
Solving the system of equations gives
$$
V3=2.7
\\
V2=-1.3
\\
V1=-2.3
\\
V4=7.4
$$
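The same four values can be obtained by solving this linear system directly; a minimal NumPy sketch with the unknowns ordered $[V1, V2, V3, V4]$ and the coefficients taken from the equations above:

```python
import numpy as np

A = np.array([
    [ 0.5, -0.5,  0.0,  0.0],   # V1 = 0.5*V1 + 0.5*V2 - 0.5
    [-0.5,  1.0, -0.5,  0.0],   # V2 = 0.5*V1 + 0.5*V3 - 1.5
    [ 0.0,  0.0,  1.0, -0.5],   # V3 = 0.5*V4 - 1
    [ 0.0, -0.1, -0.2,  0.8],   # V4 = 0.1*V2 + 0.2*V3 + 0.2*V4 + 5.5
])
b = np.array([-0.5, -1.5, -1.0, 5.5])
print(np.round(np.linalg.solve(A, b), 1))  # approximately [-2.3 -1.3  2.7  7.4]
```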
$$
Q_*(s,a)=R_s^a + \gamma \sum_{s' \in S} P_{ss'}^aV_*(s')
$$

View file

@@ -0,0 +1,97 @@
import numpy as np
def V_pi(States, Pi_sa, P_as, Rewards, gamma):
num_state = len(States)
V_curr = [0.0] * num_state
V_next = [0.0] * num_state
count = 0
# 迭代
while (True):
# 遍历所有状态 s
for curr_state in States:
v_curr_sum = 0
# 获得 状态->动作 策略概率
next_actions_prob = Pi_sa[curr_state.value]
# 遍历每个策略概率
for action_value, action_prob in enumerate(next_actions_prob):
# 获得 动作->状态 转移概率
next_states_prob = P_as[action_value]
v_sum = 0
# 遍历每个转移概率
for state_value, state_prob in enumerate(next_states_prob):
# math: \sum_{s'} P_{ss'}^a v_{\pi}(s')
v_sum += state_prob * V_next[state_value]
#end for
# math: \sum_a \pi(a|s) [R_s^a + \gamma \sum_{s'} P_{ss'}^a v_{\pi}(s')]
v_curr_sum += action_prob * (Rewards[action_value] + gamma * v_sum)
# end for
V_curr[curr_state.value] = v_curr_sum
#endfor
# 检查收敛性
if np.allclose(V_next, V_curr):
break
# 把 V_curr 赋值给 V_next
V_next = V_curr.copy()
count += 1
# end while
print(count)
return V_next
def Q_pi(Actions, Pi_sa, P_as, Rewards, gamma):
num_action = len(Actions)
Q_curr = [0.0] * num_action
Q_next = [0.0] * num_action
count = 0
# 迭代
while (True):
# 遍历每个action
for curr_action in Actions:
q_curr_sum = 0
# 获得 动作->状态 转移概率
next_states_prob = P_as[curr_action.value]
# 遍历每个转移概率求和
for state_value, state_prob in enumerate(next_states_prob):
# 获得 状态->动作 策略概率
next_actions_prob = Pi_sa[state_value]
q_sum = 0
# 遍历每个动作概率求和
for action_value, action_prob in enumerate(next_actions_prob):
# math: \sum_{a'} \pi(a'|s')*q_{\pi}(s',a')
q_sum += action_prob * Q_next[action_value]
#end for
# math: \sum_{s'} P_{ss'}^a ( \sum_{a'} \pi(a'|s')q_{\pi}(s',a') )
q_curr_sum += state_prob * q_sum
# end for
# math: q_{\pi}(s,a)=R_s^a + \sum_{s'} P_{ss'}^a ( \sum_{a'} \pi(a'|s')q_{\pi}(s',a') )
Q_curr[curr_action.value] = Rewards[curr_action.value] + gamma * q_curr_sum
#endfor
# 检查收敛性
if np.allclose(Q_next, Q_curr):
break
# 把 Q_curr 赋值给 Q_next
Q_next = Q_curr.copy()
count += 1
# end while
print(count)
return Q_next
def Q_pi_from_V_pi(Actions, P_as, Rewards, gamma, v_pi):
num_action = len(Actions)
Q = [0.0] * num_action
# 遍历每个action
for curr_action in Actions:
q_sum = 0
# 获得 动作->状态 转移概率
next_states_probs = P_as[curr_action.value]
# 遍历每个转移概率求和
for next_state_value, next_state_prob in enumerate(next_states_probs):
# math: \sum_{s'} P_{ss'}^a v_{\pi}(s')
q_sum += next_state_prob * v_pi[next_state_value]
# end for
# math: q_{\pi}(s,a)=R_s^a + \gamma \sum_{s' \in S} P_{ss'}^a v_{\pi}(s')
Q[curr_action.value] = Rewards[curr_action.value] + gamma * q_sum
#endfor
return Q

View file

@@ -0,0 +1,106 @@
import numpy as np
# state value function
def V_star(States, Pi_sa, P_as, Rewards, gamma):
num_state = len(States)
V_curr = [0.0] * num_state
V_next = [0.0] * num_state
count = 0
# 迭代
while (True):
# 遍历所有状态 s
for curr_state in States:
list_v = []
# 获得 状态->动作 策略概率
next_actions_probs = Pi_sa[curr_state.value]
# 遍历每个策略概率
for action_value, action_prob in enumerate(next_actions_probs):
if (action_prob > 0.0):
# 获得 动作->状态 转移概率
next_states_probs = P_as[action_value]
v_sum = 0
# 遍历每个转移概率
for state_value, state_prob in enumerate(next_states_probs):
# math: \sum_{s'} P_{ss'}^a v_{\pi}(s')
v_sum += state_prob * V_next[state_value]
#end for
# math: \max [R_s^a + \gamma \sum_{s'} P_{ss'}^a v_{\pi}(s')]
list_v.append(Rewards[action_value] + gamma * v_sum)
# end for
if (len(list_v) > 0):
V_curr[curr_state.value] = max(list_v)
#endfor
# 检查收敛性
if np.allclose(V_next, V_curr):
break
# 把 V_curr 赋值给 V_next
V_next = V_curr.copy()
count += 1
# end while
print(count)
return V_next
# action value function
def Q_star(Actions, Pi_sa, P_as, Rewards, gamma):
num_action = len(Actions)
Q_curr = [0.0] * num_action
Q_next = [0.0] * num_action
count = 0
# 迭代
while (count < 100):
# 遍历每个action
for curr_action in Actions:
q_curr_sum = 0
if (curr_action == Actions.Sleep):
continue
# 获得 动作->状态 转移概率
next_states_probs = P_as[curr_action.value]
# 遍历每个转移概率求和
for state_value, state_prob in enumerate(next_states_probs):
# 获得 状态->动作 策略概率
next_actions_probs = Pi_sa[state_value]
list_q = []
# 遍历每个动作概率求和
for next_action_value, next_action_prob in enumerate(next_actions_probs):
if (next_action_prob > 0.0):
# math: q_{\pi}(s',a')
list_q.append(Q_next[next_action_value])
#end for
# math: \sum_{a'} P_{ss'}^a \max q_{\pi}(s',a')
if (len(list_q) > 0):
q_curr_sum += state_prob * max(list_q)
# end for
# math: R_s^a + \gamma ( \sum_{a'} P_{ss'}^a \max q_{\pi}(s',a') )
Q_curr[curr_action.value] = Rewards[curr_action.value] + gamma * q_curr_sum
#endfor
# 检查收敛性
if np.allclose(Q_next, Q_curr):
break
# 把 Q_curr 赋值给 Q_next
Q_next = Q_curr.copy()
count += 1
# end while
print(count)
return Q_next
# math: q_*(s,a) = R_s^a + \gamma \sum_{s' \in S} P_{ss'}^a v_*(s')
def Q_star_from_V_star(Actions, P_as, Rewards, gamma, v_star):
num_action = len(Actions)
Q = [0.0] * num_action
# 遍历每个action
for curr_action in Actions:
q_sum = 0
if (curr_action == Actions.Sleep):
continue
# 获得 动作->状态 转移概率
next_states_probs = P_as[curr_action.value]
# 遍历每个转移概率求和
for next_state_value, next_state_prob in enumerate(next_states_probs):
# math: \sum_{a'} P_{ss'}^a v_{*}(s')
q_sum += next_state_prob * v_star[next_state_value]
# end for
# math: R_s^a + \gamma ( \sum_{a'} P_{ss'}^a \max q_{\pi}(s',a') )
Q[curr_action.value] = Rewards[curr_action.value] + gamma * q_sum
#endfor
return Q

View file

@@ -0,0 +1,88 @@
import math
import numpy as np
import tqdm
import multiprocessing as mp
def mc_single_process(
Rewards, TransMatrix, States,
start_state, end_states, episodes, gamma):
num_state = len(Rewards)
sum_gain = 0
for episode in tqdm.trange(episodes):
if (start_state in end_states):
# a terminal state may also carry a reward
return Rewards[start_state.value]
curr_state_value = start_state.value
gain = Rewards[curr_state_value]
power = 1
while (True):
next_state_value = np.random.choice(
num_state, p=TransMatrix[curr_state_value])
r = Rewards[next_state_value]
gain += math.pow(gamma, power) * r
if (States(next_state_value) in end_states):
# reached a terminal state; the episode ends
break
else:
power += 1
curr_state_value = next_state_value
# end while
sum_gain += gain
# end for
v = sum_gain / episodes
return v
# Monte Carlo sampling
def MonteCarol(Rewards, TransMatrix, States, end_states, gamma, episodes):
pool = mp.Pool(processes=6)
Vs = []
results = []
for start_state in States:
results.append(pool.apply_async(mc_single_process,
args=(Rewards, TransMatrix, States, start_state, end_states, episodes, gamma,)))
pool.close()
pool.join()
for i in range(len(results)):
v = results[i].get()
Vs.append(v)
return Vs
# closed-form matrix solution
def Matrix(ds, gamma):
num_state = ds.Matrix.shape[0]
I = np.eye(num_state)
tmp1 = I - gamma * ds.Matrix
tmp2 = np.linalg.inv(tmp1)
vs = np.dot(tmp2, ds.Rewards)
return vs
# iterative solution of the Bellman equation
def Bellman(States, TransMatrix, Rewards, gamma):
num_states = len(Rewards)
V_curr = [0.0] * num_states
V_next = [0.0] * num_states
count = 0
while (count < 1000):
# iterate over every state as the start_state
for start_state in States:
# transition probabilities of the current state
next_states_probs = TransMatrix[start_state.value]
v_sum = 0
# accumulate transition probability * state value over the next states
for next_state_value, next_state_prob in enumerate(next_states_probs):
# if (prob[next_state] > 0.0):
v_sum += next_state_prob * V_next[next_state_value]
# end for
V_curr[start_state.value] = Rewards[start_state.value] + gamma * v_sum
# end for
# check for convergence
if np.allclose(V_next, V_curr):
break
# copy V_curr into V_next
V_next = V_curr.copy()
count += 1
# end while
print(count)
return V_next

View file

@@ -0,0 +1,115 @@
import numpy as np
from enum import Enum
# states
class States(Enum):
Start = 0
Safe1 = 1
Hole2 = 2
Safe3 = 3
Safe4 = 4
Safe5 = 5
Safe6 = 6
Safe7 = 7
Hole8 = 8
Safe9 = 9
Hole10 = 10
Safe11 = 11
Safe12 = 12
Safe13 = 13
Safe14 = 14
Goal15 = 15
# Reward
Hole = -1
Goal = 5
# state rewards
Rewards = [0, 0, Hole, 0,
0, 0, 0, 0,
Hole, 0, Hole, 0,
0, 0, 0, Goal]
Matrix = np.array(
[
[0.0, 1/2, 0.0, 0.0,
1/2, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0], # 0
[1/3, 0.0, 1/3, 0.0,
0.0, 1/3, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0], # 1
[0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0], # 2
[0.0, 0.0, 1/2, 0.0,
0.0, 0.0, 0.0, 1/2,
0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0], # 3
[1/3, 0.0, 0.0, 0.0,
0.0, 1/3, 0.0, 0.0,
1/3, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0], # 4
[0.0, 1/4, 0.0, 0.0,
1/4, 0.0, 1/4, 0.0,
0.0, 1/4, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0], # 5
[0.0, 0.0, 1/4, 0.0,
0.0, 1/4, 0.0, 1/4,
0.0, 0.0, 1/4, 0.0,
0.0, 0.0, 0.0, 0.0], # 6
[0.0, 0.0, 0.0, 1/3,
0.0, 0.0, 1/3, 0.0,
0.0, 0.0, 0.0, 1/3,
0.0, 0.0, 0.0, 0.0], # 7
[0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0], # 8
[0.0, 0.0, 0.0, 0.0,
0.0, 1/4, 0.0, 0.0,
1/4, 0.0, 1/4, 0.0,
0.0, 1/4, 0.0, 0.0], # 9
[0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0], # 10
[0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 1/3,
0.0, 0.0, 1/3, 0.0,
0.0, 0.0, 0.0, 1/3], # 11
[0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0,
1/2, 0.0, 0.0, 0.0,
0.0, 1/2, 0.0, 0.0], # 12
[0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0,
0.0, 1/3, 0.0, 0.0,
1/3, 0.0, 1/3, 0.0], # 13
[0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 1/3, 0.0,
0.0, 1/3, 0.0, 1/3], # 14
[0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0], # 15, end state, no transform
]
)

View file

@@ -0,0 +1,223 @@
import numpy as np
from enum import Enum
# states
class States(Enum):
Start = 0
Safe1 = 1
Hole2 = 2
Safe3 = 3
Safe4 = 4
Safe5 = 5
Safe6 = 6
Safe7 = 7
Hole8 = 8
Safe9 = 9
Hole10 = 10
Safe11 = 11
Safe12 = 12
Safe13 = 13
Safe14 = 14
Goal15 = 15
# actions: the 4x4 grid has 48 directed moves between adjacent cells (both directions), minus those leading back out of a terminal state
class Actions(Enum):
a0001=0x0001
a0102=0x0102
a0203=0x0203
a0100=0x0100
a0201=0x0201
a0302=0x0302
a0004=0x0004
a0400=0x0400
a0105=0x0105
a0501 = 0x0501
a0206=0x0206
a0602 = 0x0602
a0307=0x0307
a0703 = 0x0703
a0405 = 0x0405
a0506 = 0x0506
a0607 = 0x0607
a0504 = 0x0504
a0605 = 0x0605
a0706 = 0x0706
a0408 = 0x0408
a0804 = 0x0804
a0509 = 0x0509
a0905 = 0x0905
a0610 = 0x0610
a1006 = 0x1006
a0711 = 0x0711
a1107 = 0x1107
a0809 = 0x0809
a0910 = 0x0910
a1011 = 0x1011
a1110 = 0x1110
a1009 = 0x1009
a0908 = 0x0908
a0812 = 0x0812
a1208 = 0x1208
a0913 = 0x0913
a1309 = 0x1309
a1014 = 0x1014
a1410 = 0x1410
a1115 = 0x1115
a1511 = 0x1511
a1213 = 0x1213
a1314 = 0x1314
a1415 = 0x1415
a1312 = 0x1312
a1413 = 0x1413
a1514 = 0x1514
# for the "move forward" action F:
# the probability of reaching the cell straight ahead is 0.7,
# the probability of slipping to the left is 0.2,
# the probability of slipping to the right is 0.1;
# at edges and corners the forward probability is unchanged, and a move off the grid keeps the agent in place
Front = 0.7
Left = 0.2
Right = 0.1
# Reward
Hole = -1
Goal = 5
# index of each field within an action record
Action = 0
ActionPi = 1
Reward = 2
StateProbs = 3
P=[
[ # state 0: action, pi, reward, [state, prob]
[0x0001, 1/2, 0, [[1, Front],[0, Left],[4, Right]]],
[0x0004, 1/2, 0, [[4, Front],[1, Left],[0, Right]]]
],
[ # state 1: action, prob, reward, [state, prob]
[0x0100, 1/3, 0, [[0, Front],[5, Left],[1, Right]]],
[0x0102, 1/3, Hole, [[2, Front],[1, Left],[5, Right]]],
[0x0105, 1/3, 0, [[5, Front],[2, Left],[0, Right]]]
],
[ # state 2: action, prob, reward, [state, prob]
#[0x0201, 1/3, 0, [[1, Front],[6, Left],[2, Right]]],
#[0x0203, 1/3, 0, [[3, Front],[2, Left],[6, Right]]],
#[0x0206, 1/3, 0, [[6, Front],[3, Left],[1, Right]]]
[0x0202, 1, Hole, [[2, 1]]]
],
[ # state 3: action, prob, reward, [state, prob]
[0x0302, 1/2, Hole, [[2, Front],[7, Left],[3, Right]]],
[0x0307, 1/2, 0, [[7, Front],[3, Left],[2, Right]]]
],
#############
[ # state 4: action, prob, reward, [state, prob]
[0x0400, 1/3, 0, [[0, Front],[4, Left],[5, Right]]],
[0x0405, 1/3, 0, [[5, Front],[0, Left],[8, Right]]],
[0x0408, 1/3, Hole, [[8, Front],[5, Left],[4, Right]]]
],
[ # state 5: action, prob, reward, [state, prob]
[0x0501, 1/4, 0, [[1, Front],[4, Left],[6, Right]]],
[0x0504, 1/4, 0, [[4, Front],[9, Left],[1, Right]]],
[0x0506, 1/4, 0, [[6, Front],[1, Left],[9, Right]]],
[0x0509, 1/4, 0, [[9, Front],[6, Left],[4, Right]]]
],
[ # state 6: action, prob, reward, [state, prob]
[0x0602, 1/4, Hole, [[2, Front],[5, Left],[7, Right]]],
[0x0605, 1/4, 0, [[5, Front],[10, Left],[2, Right]]],
[0x0607, 1/4, 0, [[7, Front],[2, Left],[10, Right]]],
[0x0610, 1/4, Hole, [[10, Front],[5, Left],[7, Right]]],
],
[ # state 7: action, prob, reward, [state, prob]
[0x0703, 1/3, 0, [[3, Front],[6, Left],[7, Right]]],
[0x0706, 1/3, 0, [[6, Front],[11, Left],[3, Right]]],
[0x0711, 1/3, 0, [[11, Front],[7, Left],[6, Right]]]
],
################
[ # state 8: action, prob, reward, [state, prob]
#[0x0804, 1/3, 0, [[4, Front],[8, Left],[9, Right]]],
#[0x0809, 1/3, 0, [[9, Front],[4, Left],[12, Right]]],
#[0x0812, 1/3, 0, [[12, Front],[9, Left],[8, Right]]]
[0x0808, 1, Hole, [[8, 1]]]
],
[ # state 9: action, prob, reward, [state, prob]
[0x0905, 1/4, 0, [[5, Front],[8, Left],[10, Right]]],
[0x0908, 1/4, Hole, [[8, Front],[13, Left],[5, Right]]],
[0x0910, 1/4, Hole, [[10, Front],[5, Left],[13, Right]]],
[0x0913, 1/4, 0, [[13, Front],[10, Left],[8, Right]]]
],
[ # state 10: action, prob, reward, [state, prob]
#[0x1006, 1/4, 0, [[6, Front],[9, Left],[11, Right]]],
#[0x1011, 1/4, 0, [[11, Front],[6, Left],[14, Right]]],
#[0x1014, 1/4, 0, [[14, Front],[11, Left],[9, Right]]],
#[0x1009, 1/4, 0, [[9, Front],[14, Left],[6, Right]]]
[0x1010, 1, Hole, [[10, 1]]]
],
[ # state 11: action, prob, reward, [state, prob]
[0x1107, 1/3, 0, [[7, Front],[10, Left],[11, Right]]],
[0x1110, 1/3, Hole, [[10, Front],[15, Left],[7, Right]]],
[0x1115, 1/3, 0, [[15, Front],[15, Left],[10, Right]]]
],
###########
[ # state 12: action, prob, reward, [state, prob]
[0x1208, 1/2, Hole, [[8, Front],[12, Left],[13, Right]]],
[0x1213, 1/2, 0, [[13, Front],[8, Left],[12, Right]]]
],
[ # state 13: action, prob, reward, [state, prob]
[0x1309, 1/3, 0, [[9, Front],[12, Left],[14, Right]]],
[0x1312, 1/3, 0, [[12, Front],[13, Left],[9, Right]]],
[0x1314, 1/3, 0, [[14, Front],[9, Left],[13, Right]]]
],
[ # state 14: action, prob, reward, [state, prob]
[0x1410, 1/3, Hole, [[10, Front],[13, Left],[15, Right]]],
[0x1413, 1/3, 0, [[13, Front],[14, Left],[10, Right]]],
[0x1415, 1/3, Goal, [[15, Front],[10, Left],[14, Right]]]
#[0x1414, 1, Goal, [[14, 1]]]
],
[ # state 15: action, prob, reward, [state, prob]
#[0x1511, 1/2, 0, [[15, Front],[14, Left], [15, Right]]],
#[0x1514, 1/2, 0, [[14, Front],[15, Left],[11, Right]]]
[0x1515, 1, Goal, [[15, 1]]]
]
]
class DataParser(object):
def get_next_actions(self, curr_state):
actions_data = P[curr_state]
return actions_data
def get_action_pi_reward(self, action_data):
return action_data[Action], action_data[ActionPi], action_data[Reward]
def get_action_states_probs(self, action_data):
return action_data[StateProbs]
def get_next_states_probs(self, action):
for state in P:
for actions_data in state:
if (actions_data[Action] == action):
return actions_data[Reward], actions_data[StateProbs]
return None, None
'''
dataParser = DataParser()
data = dataParser.get_next_actions(0)
print(len(data))
for i in range(len(data)):
a,p,r = dataParser.get_action_pi_reward(data[i])
print(a,p,r)
sp = dataParser.get_action_states_probs(data[i])
print(sp)
'''

View file

@@ -0,0 +1,30 @@
import numpy as np
from enum import Enum
# states
class States(Enum):
Class1 = 0
Class2 = 1
Class3 = 2
Pass = 3
Pub = 4
Play = 5
Sleep = 6
# reward vector
# [Class1, Class2, Class3, Pass, Pub, Play, Sleep]
Rewards = [-2, -2, -2, 10, 1, -1, 0]
Matrix = np.array(
[ #Cl1 Cl2 Cl3 Pas Pub Ply Slp
[0.0, 0.5, 0.0, 0.0, 0.0, 0.5, 0.0], # Class1
[0.0, 0.0, 0.8, 0.0, 0.0, 0.0, 0.2], # CLass2
[0.0, 0.0, 0.0, 0.6, 0.4, 0.0, 0.0], # Class3
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0], # Pass
[0.2, 0.4, 0.4, 0.0, 0.0, 0.0, 0.0], # Pub
[0.1, 0.0, 0.0, 0.0, 0.0, 0.9, 0.0], # Play
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] # Sleep
]
)
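# A minimal sanity check (illustrative addition, not part of the original file):
# every non-terminal row of the transition matrix should sum to 1,
# while the terminal Sleep row is all zeros.
if __name__ == "__main__":
    print(Matrix.sum(axis=1))  # expected: 1.0 for every state except Sleep (0.0)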

View file

@@ -0,0 +1,59 @@
from enum import Enum
import numpy as np
# states
class States(Enum):
Rest = 0
Game = 1
Class1 = 2
Class2 = 3
Class3 = 4
# actions
class Actions(Enum):
Quit = 0
Play1 = 1
Play2 = 2
Study1 = 3
Study2 = 4
Pass = 5
Pub = 6
Sleep= 7
# action rewards
Rewards = [0, -1, -1, -2, -2, 10, 1, 0]
# state -> action probabilities (the policy pi)
Pi_sa = np.array([
# S_Rest -> A_none
[0, 0, 0, 0, 0, 0, 0, 0],
# S_Game -> A_Quit, A_Play1
[0.5, 0.5, 0, 0, 0, 0, 0, 0],
# S_Class1 -> A_Play2, A_Study1
[0, 0, 0.5, 0.5, 0, 0, 0, 0],
# S_Class2 -> A_Study2, A_Sleep
[0, 0, 0, 0, 0.5, 0, 0, 0.5],
# S_Class3 -> A_Pass, A_Pub
[0, 0, 0, 0, 0, 0.5, 0.5, 0]
])
# action -> state transition probabilities
P_as = np.array([
# A_Quit -> S_Class1
[0, 0, 1, 0, 0],
# A_Play1 -> S_Game
[0, 1, 0, 0, 0],
# A_Play2 -> S_Game
[0, 1, 0, 0, 0],
# A_Study1 -> S_Class2
[0, 0, 0, 1, 0],
# A_Study2 -> S_Class3
[0, 0, 0, 0, 1],
# A_Pass -> S_Rest
[1, 0, 0, 0, 0],
# A_Pub -> S_Class1, S_Class2, S_Class3
[0, 0, 0.2, 0.4, 0.4],
# A_Sleep -> S_None
[0, 0, 0, 0, 0]
])
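# Illustrative addition (not part of the original file): under the fixed policy pi
# the MDP collapses to an MRP whose state-to-state transition matrix is Pi_sa @ P_as
# and whose per-state reward is the policy-weighted action reward.
if __name__ == "__main__":
    P_pi = Pi_sa @ P_as                 # 5x5 state-to-state matrix under pi
    R_pi = Pi_sa @ np.array(Rewards)    # expected immediate reward per state under pi
    print(np.round(P_pi, 2))
    print(np.round(R_pi, 2))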

View file

@@ -0,0 +1,175 @@
import numpy as np
from enum import Enum
# states
class States(Enum):
Goal0 = 0
Safe1 = 1
Hole2 = 2
Safe3 = 3
Safe4 = 4
Safe5 = 5
# actions: directed moves between adjacent cells of this 2x3 grid (terminal states have no usable outgoing moves)
class Actions(Enum):
a0001=0x0001
a0102=0x0102
a0100=0x0100
a0201=0x0201
a0004=0x0004
a0400=0x0400
a0105=0x0105
a0501 = 0x0501
a0206=0x0206
a0602 = 0x0602
a0405 = 0x0405
a0506 = 0x0506
a0504 = 0x0504
a0605 = 0x0605
# for the "move forward" action F:
# the probability of reaching the cell straight ahead is 0.7,
# the probability of slipping to the left is 0.2,
# the probability of slipping to the right is 0.1;
# at edges and corners the forward probability is unchanged, and a move off the grid keeps the agent in place
Front = 0.7
Left = 0.2
Right = 0.1
# Reward
Hole = -1
Goal = 5
# index of each field within an action record
Action = 0
ActionPi = 1
Reward = 2
StateProbs = 3
P=[
[ # state 0: action, pi, reward, [state, prob]
#[0x0000, 1, Goal, [[0, 1]]],
],
[ # state 1: action, prob, reward, [state, prob]
[0x0100, 1/3, 0, [[0, Front],[4, Left],[1, Right]]],
[0x0102, 1/3, Hole, [[2, Front],[1, Left],[5, Right]]],
[0x0104, 1/3, 0, [[4, Front],[2, Left],[0, Right]]]
],
[ # state 2: action, prob, reward, [state, prob]
#[0x0201, 1/3, 0, [[1, Front],[6, Left],[2, Right]]],
#[0x0203, 1/3, 0, [[3, Front],[2, Left],[6, Right]]],
#[0x0206, 1/3, 0, [[6, Front],[3, Left],[1, Right]]]
#[0x0202, 1, Hole, [[2, 1]]]
],
#############
[ # state 3: action, prob, reward, [state, prob]
[0x0300, 1/2, 0, [[0, Front],[3, Left],[4, Right]]],
[0x0304, 1/2, 0, [[4, Front],[0, Left],[3, Right]]],
],
[ # state 4: action, prob, reward, [state, prob]
[0x0401, 1/3, 0, [[1, Front],[3, Left],[5, Right]]],
[0x0403, 1/3, 0, [[3, Front],[4, Left],[1, Right]]],
[0x0405, 1/3, 0, [[5, Front],[1, Left],[4, Right]]],
],
[ # state 5: action, prob, reward, [state, prob]
[0x0502, 1/2, Hole, [[2, Front],[4, Left],[5, Right]]],
[0x0504, 1/2, 0, [[4, Front],[5, Left],[2, Right]]],
],
]
class DataParser(object):
def get_next_actions(self, curr_state):
actions_data = P[curr_state.value]
#print(actions_data)
return actions_data
def get_action_pi_reward(self, action_data):
return action_data[Action], action_data[ActionPi], action_data[Reward]
def get_action_states_probs(self, action_data):
return action_data[StateProbs]
def get_next_states_probs(self, action):
for state in P:
for actions_data in state:
if (actions_data[Action] == action):
return actions_data[Reward], actions_data[StateProbs]
return None, None
def V_pi(States, dataParser, gamma):
num_state = 6
V_curr = [0.0] * num_state
V_next = [0.0] * num_state
count = 0
# 迭代
while (True):
# 遍历所有状态 s
for curr_state in States:
v_curr_sum = 0
# 获得 状态->动作 策略概率
actions_data = dataParser.get_next_actions(curr_state)
# 遍历每个策略概率
for action_data in actions_data:
next_action_value, next_action_prob, reward = dataParser.get_action_pi_reward(action_data)
# 获得 动作->状态 转移概率
next_states_probs = dataParser.get_action_states_probs(action_data)
#next_states_prob = P_as[action_value]
v_sum = 0
# 遍历每个转移概率
for [next_state_value, next_state_prob] in next_states_probs:
# math: \sum_{s'} P_{ss'}^a v_{\pi}(s')
v_sum += next_state_prob * V_next[next_state_value]
#end for
# math: \sum_a \pi(a|s) [R_s^a + \gamma \sum_{s'} P_{ss'}^a v_{\pi}(s')]
v_curr_sum += next_action_prob * (reward + gamma * v_sum)
# end for
V_curr[curr_state.value] = v_curr_sum
#endfor
# 检查收敛性
if np.allclose(V_next, V_curr):
break
# 把 V_curr 赋值给 V_next 迭代
V_next = V_curr.copy()
count += 1
# end while
print(count)
return V_next
def Q2_pi(Actions, dataParser, gamma, vs):
Q = {}
# 遍历每个action
for curr_action in Actions:
q_sum = 0
# 获得 动作->状态 转移概率
reward, next_states_probs = dataParser.get_next_states_probs(curr_action.value)
if (reward is None):
continue
#next_states_probs = P_as[curr_action.value]
# 遍历每个转移概率求和
for [next_state_value, next_state_prob] in next_states_probs:
# math: \sum_{s'} P_{ss'}^a v_{\pi}(s')
q_sum += next_state_prob * vs[next_state_value]
# end for
# math: q_{\pi}(s,a)=R_s^a + \gamma \sum_{s' \in S} P_{ss'}^a v_{\pi}(s')
q = reward + gamma * q_sum
Q[curr_action.name] = q
#endfor
return Q
if __name__=="__main__":
gamma = 0.9
dataParser = DataParser()
vs = V_pi(States, dataParser, gamma)
print(np.round(np.array(vs).reshape(2,3), 2))
Q = Q2_pi(Actions, dataParser, gamma, vs)
for q in Q:
print(q, "={:.4f}".format(Q[q]))

View file

@@ -0,0 +1,142 @@
import Data_FrozenLake2 as dfl2
import numpy as np
def V_star(States, dataParser, gamma):
num_state = len(States)
V_curr = [0.0] * num_state
V_next = [0.0] * num_state
count = 0
# 迭代
while (True):
# 遍历所有状态 s
for curr_state in States:
list_v = []
# 获得 状态->动作 策略概率
next_actions_datas = dataParser.get_next_actions(curr_state.value)
for next_action_data in next_actions_datas:
next_action_value, next_action_prob, reward = dataParser.get_action_pi_reward(next_action_data)
# 获得 动作->状态 转移概率
next_states_probs = dataParser.get_action_states_probs(next_action_data)
#next_states_prob = P_as[action_value]
v_sum = 0
# 遍历每个转移概率
for [next_state_value, next_state_prob] in next_states_probs:
# math: \sum_{s'} P_{ss'}^a v_{\pi}(s')
v_sum += next_state_prob * V_next[next_state_value]
#end for
# math: \sum_a \pi(a|s) [R_s^a + \gamma \sum_{s'} P_{ss'}^a v_{\pi}(s')]
list_v.append(reward + gamma * v_sum)
# end for
if (len(list_v) > 0):
V_curr[curr_state.value] = max(list_v)
#endfor
# 检查收敛性
if np.allclose(V_next, V_curr):
break
# 把 V_curr 赋值给 V_next
V_next = V_curr.copy()
count += 1
# end while
print(count)
return V_next
def Q_star(Actions, dataParser, gamma):
# the Actions enum values are hex move codes, not 0..N-1, so key Q by the action code
Q_curr = dict.fromkeys([a.value for a in Actions], 0.0)
Q_next = dict.fromkeys([a.value for a in Actions], 0.0)
count = 0
# 迭代
while (count < 100):
# 遍历每个action
for curr_action in Actions:
q_curr_sum = 0
# 获得 动作->状态 转移概率
reward, next_states_probs = dataParser.get_next_states_probs(curr_action.value)
if (reward is None):
continue
# 遍历每个转移概率求和
for [next_state_value, next_state_prob] in next_states_probs:
# 获得 状态->动作 策略概率
actions_datas = dataParser.get_next_actions(next_state_value)
list_q = []
# 求最大值
for action_data in actions_datas:
action, _, _ = dataParser.get_action_pi_reward(action_data)
list_q.append(Q_next[action])
#end for
# math: \sum_{a'} P_{ss'}^a \max q_{\pi}(s',a')
if (len(list_q) > 0):
q_curr_sum += next_state_prob * max(list_q)
# end for
# math: R_s^a + \gamma ( \sum_{a'} P_{ss'}^a \max q_{\pi}(s',a') )
Q_curr[curr_action.value] = reward + gamma * q_curr_sum
#endfor
# 检查收敛性
if np.allclose(list(Q_next.values()), list(Q_curr.values())):
break
# 把 Q_curr 赋值给 Q_next
Q_next = Q_curr.copy()
count += 1
# end while
print(count)
return Q_next
def Q_star_from_V_star(Actions, dataParser, gamma, v_star):
Q_star = {}
# 遍历每个action
for curr_action in Actions:
q_sum = 0
# 获得 动作->状态 转移概率
reward, next_states_probs = dataParser.get_next_states_probs(curr_action.value)
if (reward is None):
continue
# 遍历每个转移概率求和
for [next_state_value, next_state_prob] in next_states_probs:
# math: \sum_{a'} P_{ss'}^a v_{*}(s')
q_sum += next_state_prob * v_star[next_state_value]
# end for
# math: R_s^a + \gamma ( \sum_{a'} P_{ss'}^a \max q_{\pi}(s',a') )
Q_star[curr_action.name] = reward + gamma * q_sum
#endfor
return sorted(Q_star.items())
def find_next_best(Q, start):
action = None
value = None
for q in Q:
if (q[0].startswith(start)):
if action is None:
action = q[0]
value = q[1]
else:
if (q[1] > value):
action = q[0]
value = q[1]
return action, value
if __name__=="__main__":
gamma = 0.9
dataParser = dfl2.DataParser()
vs = V_star(dfl2.States, dataParser, gamma)
print(np.round(np.array(vs).reshape(4,4), 2))
Q_star = Q_star_from_V_star(dfl2.Actions, dataParser, gamma, vs)
for q in Q_star:
print(q)
start = "a00"
count = 0
while(True):
action, value = find_next_best(Q_star, start)
print(action, value)
if (action is None):
break
start = "a" + action.replace(start, "")
count +=1
if (count > 8):
break

View file

@@ -0,0 +1,73 @@
import Data_FrozenLake2 as dfl2
import numpy as np
def V_pi(States, dataParser, gamma):
num_state = 16
V_curr = [0.0] * num_state
V_next = [0.0] * num_state
count = 0
# 迭代
while (True):
# 遍历所有状态 s
for curr_state in States:
v_curr_sum = 0
# 获得 状态->动作 策略概率
actions_data = dataParser.get_next_actions(curr_state.value)  # DataParser indexes P by the integer state value
# 遍历每个策略概率
for action_data in actions_data:
next_action_value, next_action_prob, reward = dataParser.get_action_pi_reward(action_data)
# 获得 动作->状态 转移概率
next_states_probs = dataParser.get_action_states_probs(action_data)
#next_states_prob = P_as[action_value]
v_sum = 0
# 遍历每个转移概率
for [next_state_value, next_state_prob] in next_states_probs:
# math: \sum_{s'} P_{ss'}^a v_{\pi}(s')
v_sum += next_state_prob * V_next[next_state_value]
#end for
# math: \sum_a \pi(a|s) [R_s^a + \gamma \sum_{s'} P_{ss'}^a v_{\pi}(s')]
v_curr_sum += next_action_prob * (reward + gamma * v_sum)
# end for
V_curr[curr_state.value] = v_curr_sum
#endfor
# 检查收敛性
if np.allclose(V_next, V_curr):
break
# 把 V_curr 赋值给 V_next 迭代
V_next = V_curr.copy()
count += 1
# end while
print(count)
return V_next
def Q2_pi(Actions, dataParser, gamma, vs):
Q = {}
# 遍历每个action
for curr_action in Actions:
q_sum = 0
# 获得 动作->状态 转移概率
reward, next_states_probs = dataParser.get_next_states_probs(curr_action.value)
if (reward is None):
continue
#next_states_probs = P_as[curr_action.value]
# 遍历每个转移概率求和
for [next_state_value, next_state_prob] in next_states_probs:
# math: \sum_{s'} P_{ss'}^a v_{\pi}(s')
q_sum += next_state_prob * vs[next_state_value]
# end for
# math: q_{\pi}(s,a)=R_s^a + \gamma \sum_{s' \in S} P_{ss'}^a v_{\pi}(s')
q = reward + gamma * q_sum
Q[curr_action.name] = q
#endfor
return Q
if __name__=="__main__":
gamma = 0.9
dataParser = dfl2.DataParser()
vs = V_pi(dfl2.States, dataParser, gamma)
print(np.round(np.array(vs).reshape(4,4), 2))
Q = Q2_pi(dfl2.Actions, dataParser, gamma, vs)
for q in Q:
print(q, "={:.4f}".format(Q[q]))

View file

@@ -0,0 +1,24 @@
import Algorithm_MDP_Star as algoMS
import Data_Students2 as ds2
def Student_V_star(gamma):
v = algoMS.V_star(ds2.States, ds2.Pi_sa, ds2.P_as, ds2.Rewards, gamma)
for start_state in ds2.States:
print(start_state, "= {:.1f}".format(v[start_state.value]))
def Student_Q_star(gamma):
v = algoMS.Q_star(ds2.Actions, ds2.Pi_sa, ds2.P_as, ds2.Rewards, gamma)
for action in ds2.Actions:
print(action, "= {:.1f}".format(v[action.value]))
def Student_Q_from_V_star(gamma):
v_star = algoMS.V_star(ds2.States, ds2.Pi_sa, ds2.P_as, ds2.Rewards, gamma)
q_star = algoMS.Q_star_from_V_star(ds2.Actions, ds2.P_as, ds2.Rewards, gamma, v_star)
for action in ds2.Actions:
print(action, "= {:.1f}".format(q_star[action.value]))
if __name__=="__main__":
gamma = 1
Student_V_star(gamma)
Student_Q_star(gamma)
Student_Q_from_V_star(gamma)

View file

@@ -0,0 +1,25 @@
import Data_Students2 as ds2
import Algorithm_MDP as mba
def Student_V_Pi(gamma):
v_pi = mba.V_pi(ds2.States, ds2.Pi_sa, ds2.P_as, ds2.Rewards, gamma)
for state in ds2.States:
print(state, "= {:.1f}".format(v_pi[state.value]))
return v_pi
def Student_Q_Pi(gamma):
q_pi = mba.Q_pi(ds2.Actions, ds2.Pi_sa, ds2.P_as, ds2.Rewards, gamma)
for action in ds2.Actions:
print(action, "= {:.1f}".format(q_pi[action.value]))
def Student_Q_Pi_From_V_Pi(gamma):
v_pi = mba.V_pi(ds2.States, ds2.Pi_sa, ds2.P_as, ds2.Rewards, gamma)
q_pi = mba.Q_pi_from_V_pi(ds2.Actions, ds2.P_as, ds2.Rewards, gamma, v_pi)
for action in ds2.Actions:
print(action, "= {:.1f}".format(q_pi[action.value]))
if __name__=="__main__":
gamma = 1
Student_V_Pi(gamma)
Student_Q_Pi(gamma)
Student_Q_Pi_From_V_Pi(gamma)

View file

@@ -0,0 +1,27 @@
import numpy as np
import Algorithm_MPR as algoM
import Data_FrozenLake as dfl
def FrozenLake_MentoCarol(gamma):
episodes = 20000
end_states = [dfl.States.Hole2, dfl.States.Hole8, dfl.States.Hole10, dfl.States.Goal15]
vs = algoM.MonteCarol(dfl.Rewards, dfl.Matrix, dfl.States, end_states, gamma, episodes)
print(np.round(np.array(vs).reshape(4,4), 2))
def FrozenLake_Matrix(gamma):
vs = algoM.Matrix(dfl, gamma)
print(np.round(np.array(vs).reshape(4,4), 2))
def FrozenLake_Bellman(gamma):
vs = algoM.Bellman(dfl.States, dfl.Matrix, dfl.Rewards, gamma)
np.set_printoptions(suppress=True)
print(np.round(np.array(vs).reshape(4,4), 2))
if __name__=="__main__":
gamma = 1
print(gamma)
#FrozenLake_MentoCarol(gamma)
FrozenLake_Matrix(gamma)
FrozenLake_Bellman(gamma)

View file

@@ -0,0 +1,30 @@
import Data_Student as ds
import Algorithm_MPR as algoM
import numpy as np
def Student_MonteCarol(gamma):
episodes = 10000
end_states = [ds.States.Sleep]
v = algoM.MonteCarol(ds.Rewards, ds.Matrix, ds.States, end_states, gamma, episodes)
for start_state in ds.States:
print(start_state, "= {:.2f}".format(v[start_state.value]))
def InvMatrix(gamma):
v = algoM.Matrix(ds, gamma)
for start_state in ds.States:
print(start_state, "= {:.2f}".format(v[start_state.value]))
return v
def Bellman(gamma):
v = algoM.Bellman(ds.States, ds.Matrix, ds.Rewards, gamma)
for start_state in ds.States:
print(start_state, "= {:.2f}".format(v[start_state.value]))
if __name__=="__main__":
gamma = 0.9
#Student_MonteCarol(gamma)
InvMatrix(gamma)
Bellman(gamma)

View file