Xiaowu/20220123 (#712)
* update * Update MRP-1.py * up * uo * uui * up * update * oo * ui * ui * Update MDP_FrozenLake_Optimal.py
@ -0,0 +1,299 @@
$$
p(s'|s,a) = \Pr \{S_t=s'|S_{t-1}=s,A_{t-1}=a\}
$$

$$
\sum_{i=0}^n p(s_i'|s_j,a) = 1
$$

$$
p(s_1'|s_j,a) + p(s_2'|s_j,a) + p(s_3'|s_j,a) = 1
$$

$$
P =
\begin{bmatrix}
p(s_1|s_1) & p(s_1|s_2) & \cdots & p(s_1|s_n) \\
p(s_2|s_1) & p(s_2|s_2) & \cdots & p(s_2|s_n) \\
\vdots & \vdots & \ddots & \vdots \\
p(s_n|s_1) & p(s_n|s_2) & \cdots & p(s_n|s_n)
\end{bmatrix}
$$

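A quick sanity check of the row-sum constraint above can be done with NumPy. This is a minimal sketch with a hypothetical helper (`check_transition_matrix` is not part of this PR); it assumes the convention used by the data files below, where a terminal state is stored as an all-zero row:

```python
import numpy as np

def check_transition_matrix(P):
    """Assert every non-terminal row of P sums to 1."""
    P = np.asarray(P)
    for i, row in enumerate(P):
        if row.sum() == 0.0:
            continue  # terminal state stored as an all-zero row
        assert np.isclose(row.sum(), 1.0), f"row {i} sums to {row.sum()}"

# tiny example with a 2-state chain whose second state is terminal
check_transition_matrix([[0.9, 0.1], [0.0, 0.0]])
```
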
Reward function

$$
R(s)=\mathbb {E} \ [R_{t} \ | \ S_{t-1}=s,A_{t-1}=a ]
$$

$$
R(s)=\mathbb {E} \ [R_{t} \ | \ S_{t-1}=s ]
$$

$$
\begin{aligned}
G_t &= R_{t+1} + \gamma R_{t+2} + \gamma^2 R_{t+3} + \cdots + \gamma^{T-t-1} R_{T} \\
&= \sum_{k=0}^{T-t-1} \gamma^k R_{t+k+1}, \ 0 \le \gamma \le 1
\end{aligned}
$$

$$
\begin{aligned}
G_t &= R_{t+1} + \gamma R_{t+2} + \gamma^2 R_{t+3} + \gamma^3 R_{t+4} + \cdots \\
&= R_{t+1} + \gamma (R_{t+2} + \gamma R_{t+3} + \gamma^{2} R_{t+4}+\cdots) \\
&= R_{t+1} + \gamma G_{t+1}
\end{aligned}
$$

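The recursion $G_t = R_{t+1} + \gamma G_{t+1}$ suggests computing returns backwards over a finite reward sequence. A minimal sketch, using a hypothetical helper and a made-up reward list purely for illustration:

```python
def returns(rewards, gamma):
    """Compute G_0, G_1, ... from a finite list of rewards R_1..R_T."""
    G = 0.0
    out = []
    for r in reversed(rewards):
        G = r + gamma * G          # G_t = R_{t+1} + gamma * G_{t+1}
        out.append(G)
    return list(reversed(out))

print(returns([-2, -2, -2, 10, 0], gamma=0.9))   # hypothetical reward sequence
```
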
$$
R_s = \mathbb{E} [R_{t+1} | S_t=s]
$$

$$
\begin{aligned}
V(s) &= \mathbb{E} [G_t \ | \ S_t=s] \\
&= \mathbb{E} [R_{t+1} + \gamma R_{t+2} + \gamma^2 R_{t+3} + \cdots \ | \ S_t=s] \\
&= \mathbb{E} [R_{t+1} + \gamma G_{t+1} \ | \ S_t=s] \\
&= \mathbb{E} [R_{t+1} \ | \ S_t=s] + \gamma \mathbb{E} [G_{t+1} \ | \ S_t=s] \\
&= R_s + \gamma \, \mathbb{E} [V(S_{t+1}) \ | \ S_t=s]
\end{aligned}
$$

Checking against the example, $V(Class3) = 4.09$, and indeed

$$
\begin{aligned}
V(Class3) &= R_{Class3}+\gamma \big[ P(Pub|Class3) \, V(Pub) + P(Pass|Class3) \, V(Pass) \big] \\
&= (-2)+0.9 \times (0.4 \times 1.93 + 0.6 \times 10) = 4.09
\end{aligned}
$$

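The same check is one line of Python (1.93 and 10 are the $V(Pub)$ and $V(Pass)$ figures quoted above, with $\gamma = 0.9$):

```python
print(-2 + 0.9 * (0.4 * 1.93 + 0.6 * 10))   # ~= 4.09, matching V(Class3)
```
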
$$
V(s) = R_s + \gamma \sum_{s'} P(s'|s) \, V(s')
$$

$$
V(s) = R_s + \gamma \sum_{s' \in S} P_{ss'} \cdot V(s')
$$

$$
V(s) = R_s + \gamma \, [p_1V(s'_1) + p_2V(s'_2) + p_3V(s'_3)]
$$

Matrix form

$$
V = R + \gamma PV
$$

$$
\begin{bmatrix}
V(1) \\
V(2) \\
\vdots \\
V(n)
\end{bmatrix}
=
\begin{bmatrix}
R_1 \\
R_2 \\
\vdots \\
R_n
\end{bmatrix}
+\gamma
\begin{bmatrix}
P_{11} & P_{12} & \cdots & P_{1n} \\
P_{21} & P_{22} & \cdots & P_{2n} \\
\vdots & \vdots & \ddots & \vdots \\
P_{n1} & P_{n2} & \cdots & P_{nn}
\end{bmatrix}
\begin{bmatrix}
V(1) \\
V(2) \\
\vdots \\
V(n)
\end{bmatrix}
$$

$$
V - \gamma PV = R \\
(I-\gamma P)V = R \\
V = (I - \gamma P)^{-1} R
$$

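For a small MRP the closed form $V = (I - \gamma P)^{-1}R$ can be evaluated directly with NumPy; this is what the `Matrix` function further down in this PR does. A minimal sketch, assuming the `Rewards` and `Matrix` arrays from the student data file below (imported under the module name `Data_Student` used elsewhere in this PR) and the $\gamma = 0.9$ of the worked example:

```python
import numpy as np
import Data_Student as ds          # the MRP data defined later in this PR

gamma = 0.9
I = np.eye(ds.Matrix.shape[0])
V = np.linalg.inv(I - gamma * ds.Matrix) @ ds.Rewards
for s in ds.States:
    print(s, "= {:.2f}".format(V[s.value]))
```
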
State-value function under a policy

$$
v_{\pi}(s)=\mathbb {E}_{\pi} [ G_t |S_t=s]
$$

$$
\begin{aligned}
v_{\pi}(s)&=\sum_{a \in A} \pi(a|s) q_\pi(s,a) \\
&=\pi(a_1|s) q_{\pi}(s,a_1)+\pi(a_2|s) q_{\pi}(s,a_2)+\pi(a_3|s) q_{\pi}(s,a_3)
\end{aligned}
$$

Action-value function under a policy

$$
q_{\pi}(s,a)=\mathbb E_{\pi} [G_t | S_t=s, A_t=a]
$$

$$
\begin{aligned}
q_{\pi}(s,a)&=R_s^a + \gamma \sum_{s' \in S} P^a_{ss'} v_{\pi}(s') \\
&= R_s^a + \gamma [P_1 v_{\pi}(s'_1)+P_2 v_{\pi}(s'_2)]
\end{aligned}
$$

$$
v_{\pi}(s)=\sum_{a \in A} \pi(a|s)\Big[ R_s^a + \gamma \sum_{s' \in S} P^a_{ss'} v_{\pi}(s') \Big]
$$

$$
q_{\pi}(s,a)=R_s^a + \gamma \sum_{s' \in S} P^a_{ss'} \sum_{a' \in A} \pi(a'|s') q_\pi(s',a')
$$

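One backup of the Bellman expectation equation is just two nested sums; the `V_pi` routine later in this PR iterates exactly this until convergence. A minimal sketch of a single backup, with a hypothetical helper name and assuming the policy and dynamics are stored as the `Pi_sa` (state to action) and `P_as` (action to state) arrays used by that code:

```python
def backup_v(s, Pi_sa, P_as, Rewards, V, gamma):
    """One application of v(s) = sum_a pi(a|s) [R_s^a + gamma * sum_s' P_ss'^a v(s')]."""
    total = 0.0
    for a, pi_a in enumerate(Pi_sa[s]):
        expected_next = sum(p * V[s2] for s2, p in enumerate(P_as[a]))
        total += pi_a * (Rewards[a] + gamma * expected_next)
    return total
```
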
$$
\begin{aligned}
V_1 &= \pi(A_{Play}|S_{V_1})(R_{Play}+\gamma P_{11}V_1)+\pi(A_{Quit}|S_{V_1})(R_{Quit}+\gamma P_{12}V_2) \\
V_2 &= \pi(A_{Play}|S_{V_2})(R_{Play}+\gamma P_{21}V_1)+\pi(A_{Study1}|S_{V_2})(R_{Study1}+\gamma P_{23}V_3) \\
V_3 &= \pi(A_{Sleep}|S_{V_3})(R_{Sleep}+\gamma P_{30}V_0)+\pi(A_{Study2}|S_{V_3})(R_{Study2}+\gamma P_{34}V_4) \\
V_4 &= \pi(A_{Pass}|S_{V_4})(R_{Pass}+\gamma P_{40}V_0)+\pi(A_{Pub}|S_{V_4})(R_{Pub}+\gamma P_{42}V_2+\gamma P_{43}V_3+\gamma P_{44}V_4)
\end{aligned}
$$

According to the Bellman optimality equation

$$
V_*(s) = \underset{a}{\max} \Big[R_s^a + \gamma \sum_{s' \in S} P_{ss'}^aV_*(s')\Big]
$$

with $\gamma=1$:

$$
V_{Rest}=V_0=0 \\
V_{Game} = V_1 = \max (-1+V_1, \ 0+V_2) \\
V_{Class1}=V_2=\max (-1+V_1, \ -2+V_3) \\
V_{Class2}=V_3 = \max (0 + V_0, \ -2+V_4) \\
V_{Class3} = V_4 = \max (10+V_0, \ 1+0.2V_2+0.4V_3+0.4V_4)
$$

To solve the above system of equations, note that an equation of the form

$$
x = \max (x+a, \ b)
$$

where $a$ and $b$ are constants with $a<0$ can only hold when $x=b$. Therefore

$$
V_1 = \max (-1+V_1, \ 0+V_2)=V_2 \\
V_2=\max (-1+V_1, \ -2+V_3)=\max (-1+V_2, \ -2+V_3)=V_3-2=V_1 \\
V_3 = V_1+2 \\
V_3 = \max (0 + V_0, \ -2+V_4)=V_4-2 \\
V_4 = V_3+2=V_1+4 \\
V_4 = \max (10+V_0, \ 1+0.2V_2+0.4V_3+0.4V_4)
$$

Substituting $V_1$ for all the other variables:

$$
V_4 = \max (10, \ 1+0.2V_1+0.4(V_1+2)+0.4(V_1+4))=\max (10, \ V_1+3.4)=\max(10, \ V_4-0.6)
$$

Therefore

$$
V_4=10 \\
V_1=V_4-4=6 \\
V_2=V_1=6 \\
V_3=V_1+2=8
$$

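These hand-derived values can be confirmed by simply iterating the five max-equations above until they stop changing. A minimal sketch (the update order and iteration count are arbitrary choices, not part of the original text):

```python
V = [0.0] * 5            # V0..V4: Rest, Game, Class1, Class2, Class3
for _ in range(100):
    V[0] = 0
    V[1] = max(-1 + V[1], 0 + V[2])
    V[2] = max(-1 + V[1], -2 + V[3])
    V[3] = max(0 + V[0], -2 + V[4])
    V[4] = max(10 + V[0], 1 + 0.2 * V[2] + 0.4 * V[3] + 0.4 * V[4])
print(V)                 # values converge to 0, 6, 6, 8, 10
```
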
For the policy that chooses each available action with probability 0.5, according to

$$
v_{\pi}(s)=\sum_{a \in A} \pi(a|s)\Big[ R_s^a + \gamma \sum_{s' \in S} P^a_{ss'} v_{\pi}(s') \Big]
$$

with $\gamma=1$:

$$
V_0=0 \\
V_1=0.5(-1+1 \cdot 1 \cdot V_1)+0.5(0+1 \cdot 1 \cdot V_2)=0.5V_1+0.5V_2-0.5 \\
V_2=0.5(-1+1 \cdot 1 \cdot V_1)+0.5(-2+1 \cdot 1 \cdot V_3)=0.5V_1+0.5V_3-1.5 \\
V_3=0.5(0+1 \cdot 1 \cdot 0)+0.5(-2+1 \cdot 1 \cdot V_4)=0.5V_4-1 \\
V_4=0.5(10+1 \cdot 1 \cdot 0)+0.5(1+1 \cdot 0.2 V_2+1 \cdot 0.4 V_3+1 \cdot 0.4 V_4)=0.1V_2+0.2V_3+0.2V_4+5.5
$$

Solving this linear system gives

$$
V_1=-2.3 \\
V_2=-1.3 \\
V_3=2.7 \\
V_4=7.4
$$

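The same four-variable linear system can be solved exactly with `np.linalg.solve`; rounding reproduces the figures above. A minimal sketch, with the coefficient matrix rearranged from the four equations just given:

```python
import numpy as np

# unknowns ordered V1, V2, V3, V4; each row rearranges one equation above
A = np.array([
    [0.5, -0.5, 0.0, 0.0],    # V1 - 0.5*V1 - 0.5*V2 = -0.5
    [-0.5, 1.0, -0.5, 0.0],   # V2 - 0.5*V1 - 0.5*V3 = -1.5
    [0.0, 0.0, 1.0, -0.5],    # V3 - 0.5*V4 = -1
    [0.0, -0.1, -0.2, 0.8],   # V4 - 0.1*V2 - 0.2*V3 - 0.2*V4 = 5.5
])
b = np.array([-0.5, -1.5, -1.0, 5.5])
print(np.round(np.linalg.solve(A, b), 1))   # [-2.3 -1.3  2.7  7.4]
```
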
$$
Q_*(s,a)=R_s^a + \gamma \sum_{s' \in S} P_{ss'}^aV_*(s')
$$

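With $\gamma = 1$ and the optimal state values derived above, the optimal action values follow directly from this formula; the `Q_star_from_V_star` routine later in this PR does the same thing in general. A minimal sketch for the student MDP, assuming the `P_as` and `Rewards` arrays from the second student data file below (module name `Data_Students2`, as imported elsewhere in this PR):

```python
import numpy as np
import Data_Students2 as ds2       # MDP data defined later in this PR

gamma = 1.0
V_star = np.array([0.0, 6.0, 6.0, 8.0, 10.0])   # Rest, Game, Class1, Class2, Class3
for a in ds2.Actions:
    q = ds2.Rewards[a.value] + gamma * ds2.P_as[a.value] @ V_star
    print(a, "= {:.1f}".format(q))
```
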
@ -0,0 +1,97 @@
import numpy as np


def V_pi(States, Pi_sa, P_as, Rewards, gamma):
    num_state = len(States)
    V_curr = [0.0] * num_state
    V_next = [0.0] * num_state
    count = 0
    # iterate until convergence
    while True:
        # sweep over every state s
        for curr_state in States:
            v_curr_sum = 0
            # policy probabilities pi(a|s) for the current state
            next_actions_prob = Pi_sa[curr_state.value]
            # loop over every action allowed by the policy
            for action_value, action_prob in enumerate(next_actions_prob):
                # transition probabilities P(s'|s,a) for this action
                next_states_prob = P_as[action_value]
                v_sum = 0
                # loop over every successor state
                for state_value, state_prob in enumerate(next_states_prob):
                    # math: \sum_{s'} P_{ss'}^a v_{\pi}(s')
                    v_sum += state_prob * V_next[state_value]
                # end for
                # math: \sum_a \pi(a|s) [R_s^a + \gamma \sum_{s'} P_{ss'}^a v_{\pi}(s')]
                v_curr_sum += action_prob * (Rewards[action_value] + gamma * v_sum)
            # end for
            V_curr[curr_state.value] = v_curr_sum
        # end for
        # check convergence
        if np.allclose(V_next, V_curr):
            break
        # copy V_curr into V_next for the next sweep
        V_next = V_curr.copy()
        count += 1
    # end while
    print(count)
    return V_next


def Q_pi(Actions, Pi_sa, P_as, Rewards, gamma):
    num_action = len(Actions)
    Q_curr = [0.0] * num_action
    Q_next = [0.0] * num_action
    count = 0
    # iterate until convergence
    while True:
        # sweep over every action
        for curr_action in Actions:
            q_curr_sum = 0
            # transition probabilities P(s'|s,a) for this action
            next_states_prob = P_as[curr_action.value]
            # sum over every successor state
            for state_value, state_prob in enumerate(next_states_prob):
                # policy probabilities pi(a'|s') in the successor state
                next_actions_prob = Pi_sa[state_value]
                q_sum = 0
                # sum over every next action
                for action_value, action_prob in enumerate(next_actions_prob):
                    # math: \sum_{a'} \pi(a'|s') q_{\pi}(s',a')
                    q_sum += action_prob * Q_next[action_value]
                # end for
                # math: \sum_{s'} P_{ss'}^a ( \sum_{a'} \pi(a'|s') q_{\pi}(s',a') )
                q_curr_sum += state_prob * q_sum
            # end for
            # math: q_{\pi}(s,a) = R_s^a + \gamma \sum_{s'} P_{ss'}^a ( \sum_{a'} \pi(a'|s') q_{\pi}(s',a') )
            Q_curr[curr_action.value] = Rewards[curr_action.value] + gamma * q_curr_sum
        # end for
        # check convergence
        if np.allclose(Q_next, Q_curr):
            break
        # copy Q_curr into Q_next for the next sweep
        Q_next = Q_curr.copy()
        count += 1
    # end while
    print(count)
    return Q_next


def Q_pi_from_V_pi(Actions, P_as, Rewards, gamma, v_pi):
    num_action = len(Actions)
    Q = [0.0] * num_action
    # loop over every action
    for curr_action in Actions:
        q_sum = 0
        # transition probabilities P(s'|s,a) for this action
        next_states_probs = P_as[curr_action.value]
        # sum over every successor state
        for next_state_value, next_state_prob in enumerate(next_states_probs):
            # math: \sum_{s'} P_{ss'}^a v_{\pi}(s')
            q_sum += next_state_prob * v_pi[next_state_value]
        # end for
        # math: q_{\pi}(s,a) = R_s^a + \gamma \sum_{s' \in S} P_{ss'}^a v_{\pi}(s')
        Q[curr_action.value] = Rewards[curr_action.value] + gamma * q_sum
    # end for
    return Q
@ -0,0 +1,106 @@
import numpy as np


# optimal state-value function
def V_star(States, Pi_sa, P_as, Rewards, gamma):
    num_state = len(States)
    V_curr = [0.0] * num_state
    V_next = [0.0] * num_state
    count = 0
    # iterate until convergence
    while True:
        # sweep over every state s
        for curr_state in States:
            list_v = []
            # policy probabilities pi(a|s) for the current state
            next_actions_probs = Pi_sa[curr_state.value]
            # loop over every action allowed by the policy
            for action_value, action_prob in enumerate(next_actions_probs):
                if action_prob > 0.0:
                    # transition probabilities P(s'|s,a) for this action
                    next_states_probs = P_as[action_value]
                    v_sum = 0
                    # loop over every successor state
                    for state_value, state_prob in enumerate(next_states_probs):
                        # math: \sum_{s'} P_{ss'}^a v_{*}(s')
                        v_sum += state_prob * V_next[state_value]
                    # end for
                    # math: \max_a [R_s^a + \gamma \sum_{s'} P_{ss'}^a v_{*}(s')]
                    list_v.append(Rewards[action_value] + gamma * v_sum)
            # end for
            if len(list_v) > 0:
                V_curr[curr_state.value] = max(list_v)
        # end for
        # check convergence
        if np.allclose(V_next, V_curr):
            break
        # copy V_curr into V_next for the next sweep
        V_next = V_curr.copy()
        count += 1
    # end while
    print(count)
    return V_next


# optimal action-value function
def Q_star(Actions, Pi_sa, P_as, Rewards, gamma):
    num_action = len(Actions)
    Q_curr = [0.0] * num_action
    Q_next = [0.0] * num_action
    count = 0
    # iterate (bounded, in case of slow convergence)
    while count < 100:
        # sweep over every action
        for curr_action in Actions:
            q_curr_sum = 0
            if curr_action == Actions.Sleep:
                continue
            # transition probabilities P(s'|s,a) for this action
            next_states_probs = P_as[curr_action.value]
            # sum over every successor state
            for state_value, state_prob in enumerate(next_states_probs):
                # policy probabilities pi(a'|s') in the successor state
                next_actions_probs = Pi_sa[state_value]
                list_q = []
                # collect the value of every available next action
                for next_action_value, next_action_prob in enumerate(next_actions_probs):
                    if next_action_prob > 0.0:
                        # math: q_{*}(s',a')
                        list_q.append(Q_next[next_action_value])
                # end for
                # math: \sum_{s'} P_{ss'}^a \max_{a'} q_{*}(s',a')
                if len(list_q) > 0:
                    q_curr_sum += state_prob * max(list_q)
            # end for
            # math: R_s^a + \gamma ( \sum_{s'} P_{ss'}^a \max_{a'} q_{*}(s',a') )
            Q_curr[curr_action.value] = Rewards[curr_action.value] + gamma * q_curr_sum
        # end for
        # check convergence
        if np.allclose(Q_next, Q_curr):
            break
        # copy Q_curr into Q_next for the next sweep
        Q_next = Q_curr.copy()
        count += 1
    # end while
    print(count)
    return Q_next


# math: q_*(s,a) = R_s^a + \gamma \sum_{s' \in S} P_{ss'}^a v_*(s')
def Q_star_from_V_star(Actions, P_as, Rewards, gamma, v_star):
    num_action = len(Actions)
    Q = [0.0] * num_action
    # loop over every action
    for curr_action in Actions:
        q_sum = 0
        if curr_action == Actions.Sleep:
            continue
        # transition probabilities P(s'|s,a) for this action
        next_states_probs = P_as[curr_action.value]
        # sum over every successor state
        for next_state_value, next_state_prob in enumerate(next_states_probs):
            # math: \sum_{s'} P_{ss'}^a v_{*}(s')
            q_sum += next_state_prob * v_star[next_state_value]
        # end for
        # math: R_s^a + \gamma \sum_{s'} P_{ss'}^a v_{*}(s')
        Q[curr_action.value] = Rewards[curr_action.value] + gamma * q_sum
    # end for
    return Q
@ -0,0 +1,88 @@
import math
import numpy as np
import tqdm
import multiprocessing as mp


def mc_single_process(
        Rewards, TransMatrix, States,
        start_state, end_states, episodes, gamma):
    num_state = len(Rewards)
    sum_gain = 0
    for episode in tqdm.trange(episodes):
        if start_state in end_states:
            # a terminal state may itself carry a reward
            return Rewards[start_state.value]
        curr_state_value = start_state.value
        gain = Rewards[curr_state_value]
        power = 1
        while True:
            next_state_value = np.random.choice(
                num_state, p=TransMatrix[curr_state_value])
            r = Rewards[next_state_value]
            gain += math.pow(gamma, power) * r
            if States(next_state_value) in end_states:
                # reached a terminal state, this episode ends
                break
            else:
                power += 1
                curr_state_value = next_state_value
        # end while
        sum_gain += gain
    # end for
    v = sum_gain / episodes
    return v


# Monte Carlo sampling, one worker process per start state
def MonteCarol(Rewards, TransMatrix, States, end_states, gamma, episodes):
    pool = mp.Pool(processes=6)
    Vs = []
    results = []
    for start_state in States:
        results.append(pool.apply_async(mc_single_process,
            args=(Rewards, TransMatrix, States, start_state, end_states, episodes, gamma,)))
    pool.close()
    pool.join()
    for i in range(len(results)):
        v = results[i].get()
        Vs.append(v)

    return Vs


# matrix (closed-form) method: V = (I - gamma * P)^{-1} R
def Matrix(ds, gamma):
    num_state = ds.Matrix.shape[0]
    I = np.eye(num_state)
    tmp1 = I - gamma * ds.Matrix
    tmp2 = np.linalg.inv(tmp1)
    vs = np.dot(tmp2, ds.Rewards)

    return vs


# iteration of the Bellman expectation equation
def Bellman(States, TransMatrix, Rewards, gamma):
    num_states = len(Rewards)
    V_curr = [0.0] * num_states
    V_next = [0.0] * num_states
    count = 0
    while count < 1000:
        # treat every state as start_state
        for start_state in States:
            # transition probabilities out of start_state
            next_states_probs = TransMatrix[start_state.value]
            v_sum = 0
            # accumulate transition probability times next-state value
            for next_state_value, next_state_prob in enumerate(next_states_probs):
                v_sum += next_state_prob * V_next[next_state_value]
            # end for
            V_curr[start_state.value] = Rewards[start_state.value] + gamma * v_sum
        # end for
        # check convergence
        if np.allclose(V_next, V_curr):
            break
        # copy V_curr into V_next for the next sweep
        V_next = V_curr.copy()
        count += 1
    # end while
    print(count)
    return V_next
@ -0,0 +1,115 @@
import numpy as np
from enum import Enum


# states
class States(Enum):
    Start = 0
    Safe1 = 1
    Hole2 = 2
    Safe3 = 3
    Safe4 = 4
    Safe5 = 5
    Safe6 = 6
    Safe7 = 7
    Hole8 = 8
    Safe9 = 9
    Hole10 = 10
    Safe11 = 11
    Safe12 = 12
    Safe13 = 13
    Safe14 = 14
    Goal15 = 15

# reward constants
Hole = -1
Goal = 5

# per-state rewards
Rewards = [0, 0, Hole, 0,
           0, 0, 0, 0,
           Hole, 0, Hole, 0,
           0, 0, 0, Goal]

# state-to-state transition matrix (each row lists the successors of one cell)
Matrix = np.array(
    [
        [0.0, 1/2, 0.0, 0.0,
         1/2, 0.0, 0.0, 0.0,
         0.0, 0.0, 0.0, 0.0,
         0.0, 0.0, 0.0, 0.0],  # 0
        [1/3, 0.0, 1/3, 0.0,
         0.0, 1/3, 0.0, 0.0,
         0.0, 0.0, 0.0, 0.0,
         0.0, 0.0, 0.0, 0.0],  # 1
        [0.0, 0.0, 0.0, 0.0,
         0.0, 0.0, 0.0, 0.0,
         0.0, 0.0, 0.0, 0.0,
         0.0, 0.0, 0.0, 0.0],  # 2
        [0.0, 0.0, 1/2, 0.0,
         0.0, 0.0, 0.0, 1/2,
         0.0, 0.0, 0.0, 0.0,
         0.0, 0.0, 0.0, 0.0],  # 3
        [1/3, 0.0, 0.0, 0.0,
         0.0, 1/3, 0.0, 0.0,
         1/3, 0.0, 0.0, 0.0,
         0.0, 0.0, 0.0, 0.0],  # 4
        [0.0, 1/4, 0.0, 0.0,
         1/4, 0.0, 1/4, 0.0,
         0.0, 1/4, 0.0, 0.0,
         0.0, 0.0, 0.0, 0.0],  # 5
        [0.0, 0.0, 1/4, 0.0,
         0.0, 1/4, 0.0, 1/4,
         0.0, 0.0, 1/4, 0.0,
         0.0, 0.0, 0.0, 0.0],  # 6
        [0.0, 0.0, 0.0, 1/3,
         0.0, 0.0, 1/3, 0.0,
         0.0, 0.0, 0.0, 1/3,
         0.0, 0.0, 0.0, 0.0],  # 7
        [0.0, 0.0, 0.0, 0.0,
         0.0, 0.0, 0.0, 0.0,
         0.0, 0.0, 0.0, 0.0,
         0.0, 0.0, 0.0, 0.0],  # 8
        [0.0, 0.0, 0.0, 0.0,
         0.0, 1/4, 0.0, 0.0,
         1/4, 0.0, 1/4, 0.0,
         0.0, 1/4, 0.0, 0.0],  # 9
        [0.0, 0.0, 0.0, 0.0,
         0.0, 0.0, 0.0, 0.0,
         0.0, 0.0, 0.0, 0.0,
         0.0, 0.0, 0.0, 0.0],  # 10
        [0.0, 0.0, 0.0, 0.0,
         0.0, 0.0, 0.0, 1/3,
         0.0, 0.0, 1/3, 0.0,
         0.0, 0.0, 0.0, 1/3],  # 11
        [0.0, 0.0, 0.0, 0.0,
         0.0, 0.0, 0.0, 0.0,
         1/2, 0.0, 0.0, 0.0,
         0.0, 1/2, 0.0, 0.0],  # 12
        [0.0, 0.0, 0.0, 0.0,
         0.0, 0.0, 0.0, 0.0,
         0.0, 1/3, 0.0, 0.0,
         1/3, 0.0, 1/3, 0.0],  # 13
        [0.0, 0.0, 0.0, 0.0,
         0.0, 0.0, 0.0, 0.0,
         0.0, 0.0, 1/3, 0.0,
         0.0, 1/3, 0.0, 1/3],  # 14
        [0.0, 0.0, 0.0, 0.0,
         0.0, 0.0, 0.0, 0.0,
         0.0, 0.0, 0.0, 0.0,
         0.0, 0.0, 0.0, 0.0],  # 15, terminal state, no transitions
    ]
)
@ -0,0 +1,223 @@
import numpy as np
from enum import Enum


# states
class States(Enum):
    Start = 0
    Safe1 = 1
    Hole2 = 2
    Safe3 = 3
    Safe4 = 4
    Safe5 = 5
    Safe6 = 6
    Safe7 = 7
    Hole8 = 8
    Safe9 = 9
    Hole10 = 10
    Safe11 = 11
    Safe12 = 12
    Safe13 = 13
    Safe14 = 14
    Goal15 = 15


# actions: for the 4x4 grid there are 48 directed moves between adjacent cells
# (minus those that would leave a terminal state)
class Actions(Enum):
    a0001 = 0x0001
    a0102 = 0x0102
    a0203 = 0x0203
    a0100 = 0x0100
    a0201 = 0x0201
    a0302 = 0x0302

    a0004 = 0x0004
    a0400 = 0x0400
    a0105 = 0x0105
    a0501 = 0x0501
    a0206 = 0x0206
    a0602 = 0x0602
    a0307 = 0x0307
    a0703 = 0x0703

    a0405 = 0x0405
    a0506 = 0x0506
    a0607 = 0x0607
    a0504 = 0x0504
    a0605 = 0x0605
    a0706 = 0x0706

    a0408 = 0x0408
    a0804 = 0x0804
    a0509 = 0x0509
    a0905 = 0x0905
    a0610 = 0x0610
    a1006 = 0x1006
    a0711 = 0x0711
    a1107 = 0x1107

    a0809 = 0x0809
    a0910 = 0x0910
    a1011 = 0x1011
    a1110 = 0x1110
    a1009 = 0x1009
    a0908 = 0x0908

    a0812 = 0x0812
    a1208 = 0x1208
    a0913 = 0x0913
    a1309 = 0x1309
    a1014 = 0x1014
    a1410 = 0x1410
    a1115 = 0x1115
    a1511 = 0x1511

    a1213 = 0x1213
    a1314 = 0x1314
    a1415 = 0x1415

    a1312 = 0x1312
    a1413 = 0x1413
    a1514 = 0x1514


# when taking a "forward" move,
# the probability of reaching the intended cell is 0.7,
# the probability of slipping to the left is 0.2,
# the probability of slipping to the right is 0.1;
# at edges and corners the forward probability is unchanged,
# and sliding off the grid leaves the agent where it is
Front = 0.7
Left = 0.2
Right = 0.1
# reward constants
Hole = -1
Goal = 5

# index semantics of each action record
Action = 0
ActionPi = 1
Reward = 2
StateProbs = 3

P = [
    [  # state 0: action, pi, reward, [state, prob]
        [0x0001, 1/2, 0, [[1, Front], [0, Left], [4, Right]]],
        [0x0004, 1/2, 0, [[4, Front], [1, Left], [0, Right]]]
    ],
    [  # state 1: action, pi, reward, [state, prob]
        [0x0100, 1/3, 0, [[0, Front], [5, Left], [1, Right]]],
        [0x0102, 1/3, Hole, [[2, Front], [1, Left], [5, Right]]],
        [0x0105, 1/3, 0, [[5, Front], [2, Left], [0, Right]]]
    ],
    [  # state 2: action, pi, reward, [state, prob]
        #[0x0201, 1/3, 0, [[1, Front],[6, Left],[2, Right]]],
        #[0x0203, 1/3, 0, [[3, Front],[2, Left],[6, Right]]],
        #[0x0206, 1/3, 0, [[6, Front],[3, Left],[1, Right]]]
        [0x0202, 1, Hole, [[2, 1]]]
    ],
    [  # state 3: action, pi, reward, [state, prob]
        [0x0302, 1/2, Hole, [[2, Front], [7, Left], [3, Right]]],
        [0x0307, 1/2, 0, [[7, Front], [3, Left], [2, Right]]]
    ],
    #############
    [  # state 4: action, pi, reward, [state, prob]
        [0x0400, 1/3, 0, [[0, Front], [4, Left], [5, Right]]],
        [0x0405, 1/3, 0, [[5, Front], [0, Left], [8, Right]]],
        [0x0408, 1/3, Hole, [[8, Front], [5, Left], [4, Right]]]
    ],
    [  # state 5: action, pi, reward, [state, prob]
        [0x0501, 1/4, 0, [[1, Front], [4, Left], [6, Right]]],
        [0x0504, 1/4, 0, [[4, Front], [9, Left], [1, Right]]],
        [0x0506, 1/4, 0, [[6, Front], [1, Left], [9, Right]]],
        [0x0509, 1/4, 0, [[9, Front], [6, Left], [4, Right]]]
    ],
    [  # state 6: action, pi, reward, [state, prob]
        [0x0602, 1/4, Hole, [[2, Front], [5, Left], [7, Right]]],
        [0x0605, 1/4, 0, [[5, Front], [10, Left], [2, Right]]],
        [0x0607, 1/4, 0, [[7, Front], [2, Left], [10, Right]]],
        [0x0610, 1/4, Hole, [[10, Front], [5, Left], [7, Right]]],
    ],
    [  # state 7: action, pi, reward, [state, prob]
        [0x0703, 1/3, 0, [[3, Front], [6, Left], [7, Right]]],
        [0x0706, 1/3, 0, [[6, Front], [11, Left], [3, Right]]],
        [0x0711, 1/3, 0, [[11, Front], [7, Left], [6, Right]]]
    ],
    ################
    [  # state 8: action, pi, reward, [state, prob]
        #[0x0804, 1/3, 0, [[4, Front],[8, Left],[9, Right]]],
        #[0x0809, 1/3, 0, [[9, Front],[4, Left],[12, Right]]],
        #[0x0812, 1/3, 0, [[12, Front],[9, Left],[8, Right]]]
        [0x0808, 1, Hole, [[8, 1]]]
    ],
    [  # state 9: action, pi, reward, [state, prob]
        [0x0905, 1/4, 0, [[5, Front], [8, Left], [10, Right]]],
        [0x0908, 1/4, Hole, [[8, Front], [13, Left], [5, Right]]],
        [0x0910, 1/4, Hole, [[10, Front], [5, Left], [13, Right]]],
        [0x0913, 1/4, 0, [[13, Front], [10, Left], [8, Right]]]
    ],
    [  # state 10: action, pi, reward, [state, prob]
        #[0x1006, 1/4, 0, [[6, Front],[9, Left],[11, Right]]],
        #[0x1011, 1/4, 0, [[11, Front],[6, Left],[14, Right]]],
        #[0x1014, 1/4, 0, [[14, Front],[11, Left],[9, Right]]],
        #[0x1009, 1/4, 0, [[9, Front],[14, Left],[6, Right]]]
        [0x1010, 1, Hole, [[10, 1]]]
    ],
    [  # state 11: action, pi, reward, [state, prob]
        [0x1107, 1/3, 0, [[7, Front], [10, Left], [11, Right]]],
        [0x1110, 1/3, Hole, [[10, Front], [15, Left], [7, Right]]],
        [0x1115, 1/3, 0, [[15, Front], [15, Left], [10, Right]]]
    ],
    ###########
    [  # state 12: action, pi, reward, [state, prob]
        [0x1208, 1/2, Hole, [[8, Front], [12, Left], [13, Right]]],
        [0x1213, 1/2, 0, [[13, Front], [8, Left], [12, Right]]]
    ],
    [  # state 13: action, pi, reward, [state, prob]
        [0x1309, 1/3, 0, [[9, Front], [12, Left], [14, Right]]],
        [0x1312, 1/3, 0, [[12, Front], [13, Left], [9, Right]]],
        [0x1314, 1/3, 0, [[14, Front], [9, Left], [13, Right]]]
    ],
    [  # state 14: action, pi, reward, [state, prob]
        [0x1410, 1/3, Hole, [[10, Front], [13, Left], [15, Right]]],
        [0x1413, 1/3, 0, [[13, Front], [14, Left], [10, Right]]],
        [0x1415, 1/3, Goal, [[15, Front], [10, Left], [14, Right]]]
        #[0x1414, 1, Goal, [[14, 1]]]
    ],
    [  # state 15: action, pi, reward, [state, prob]
        #[0x1511, 1/2, 0, [[15, Front],[14, Left], [15, Right]]],
        #[0x1514, 1/2, 0, [[14, Front],[15, Left],[11, Right]]]
        [0x1515, 1, Goal, [[15, 1]]]
    ]

]


class DataParser(object):
    def get_next_actions(self, curr_state):
        actions_data = P[curr_state]
        return actions_data

    def get_action_pi_reward(self, action_data):
        return action_data[Action], action_data[ActionPi], action_data[Reward]

    def get_action_states_probs(self, action_data):
        return action_data[StateProbs]

    def get_next_states_probs(self, action):
        for state in P:
            for actions_data in state:
                if actions_data[Action] == action:
                    return actions_data[Reward], actions_data[StateProbs]
        return None, None


'''
dataParser = DataParser()
data = dataParser.get_next_actions(0)
print(len(data))
for i in range(len(data)):
    a, p, r = dataParser.get_action_pi_reward(data[i])
    print(a, p, r)
    sp = dataParser.get_action_states_probs(data[i])
    print(sp)
'''
@ -0,0 +1,30 @@
import numpy as np
from enum import Enum


# states
class States(Enum):
    Class1 = 0
    Class2 = 1
    Class3 = 2
    Pass = 3
    Pub = 4
    Play = 5
    Sleep = 6

# reward vector
# [Class1, Class2, Class3, Pass, Pub, Play, Sleep]
Rewards = [-2, -2, -2, 10, 1, -1, 0]

# state-to-state transition matrix
Matrix = np.array(
    [  #Cl1  Cl2  Cl3  Pas  Pub  Ply  Slp
        [0.0, 0.5, 0.0, 0.0, 0.0, 0.5, 0.0],  # Class1
        [0.0, 0.0, 0.8, 0.0, 0.0, 0.0, 0.2],  # Class2
        [0.0, 0.0, 0.0, 0.6, 0.4, 0.0, 0.0],  # Class3
        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],  # Pass
        [0.2, 0.4, 0.4, 0.0, 0.0, 0.0, 0.0],  # Pub
        [0.1, 0.0, 0.0, 0.0, 0.0, 0.9, 0.0],  # Play
        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]   # Sleep
    ]
)
@ -0,0 +1,59 @@
from enum import Enum
import numpy as np


# states
class States(Enum):
    Rest = 0
    Game = 1
    Class1 = 2
    Class2 = 3
    Class3 = 4


# actions
class Actions(Enum):
    Quit = 0
    Play1 = 1
    Play2 = 2
    Study1 = 3
    Study2 = 4
    Pass = 5
    Pub = 6
    Sleep = 7


# per-action rewards
Rewards = [0, -1, -1, -2, -2, 10, 1, 0]

# state -> action probabilities, pi(a|s)
Pi_sa = np.array([
    # S_Rest -> A_none
    [0, 0, 0, 0, 0, 0, 0, 0],
    # S_Game -> A_Quit, A_Play1
    [0.5, 0.5, 0, 0, 0, 0, 0, 0],
    # S_Class1 -> A_Play2, A_Study1
    [0, 0, 0.5, 0.5, 0, 0, 0, 0],
    # S_Class2 -> A_Study2, A_Sleep
    [0, 0, 0, 0, 0.5, 0, 0, 0.5],
    # S_Class3 -> A_Pass, A_Pub
    [0, 0, 0, 0, 0, 0.5, 0.5, 0]
])

# action -> state transition probabilities, P(s'|s,a)
P_as = np.array([
    # A_Quit -> S_Class1
    [0, 0, 1, 0, 0],
    # A_Play1 -> S_Game
    [0, 1, 0, 0, 0],
    # A_Play2 -> S_Game
    [0, 1, 0, 0, 0],
    # A_Study1 -> S_Class2
    [0, 0, 0, 1, 0],
    # A_Study2 -> S_Class3
    [0, 0, 0, 0, 1],
    # A_Pass -> S_Rest
    [1, 0, 0, 0, 0],
    # A_Pub -> S_Class1, S_Class2, S_Class3
    [0, 0, 0.2, 0.4, 0.4],
    # A_Sleep -> S_None
    [0, 0, 0, 0, 0]
])
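Because the policy is stored as a state-to-action matrix and the dynamics as an action-to-state matrix, the Markov chain induced by the policy is just their product, and the expected per-state reward is `Pi_sa @ Rewards`. A minimal sketch of that composition as a usage note (the names `P_pi` and `R_pi` are mine, not part of the PR's code):

```python
import numpy as np
import Data_Students2 as ds2

# P_pi[s, s'] = sum_a pi(a|s) * P(s'|s,a);  R_pi[s] = sum_a pi(a|s) * R(a)
P_pi = ds2.Pi_sa @ ds2.P_as
R_pi = ds2.Pi_sa @ np.array(ds2.Rewards)

# with these two arrays, the MRP machinery (e.g. the Bellman iteration) applies unchanged
print(np.round(P_pi, 2))
print(np.round(R_pi, 2))
```
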
@ -0,0 +1,175 @@
import numpy as np
from enum import Enum


# states
class States(Enum):
    Goal0 = 0
    Safe1 = 1
    Hole2 = 2
    Safe3 = 3
    Safe4 = 4
    Safe5 = 5


# actions: directed moves between cells of the small grid
# (moves out of terminal cells are omitted)
class Actions(Enum):
    a0001 = 0x0001
    a0102 = 0x0102
    a0100 = 0x0100
    a0201 = 0x0201

    a0004 = 0x0004
    a0400 = 0x0400
    a0105 = 0x0105
    a0501 = 0x0501
    a0206 = 0x0206
    a0602 = 0x0602

    a0405 = 0x0405
    a0506 = 0x0506
    a0504 = 0x0504
    a0605 = 0x0605


# when taking a "forward" move,
# the probability of reaching the intended cell is 0.7,
# the probability of slipping to the left is 0.2,
# the probability of slipping to the right is 0.1;
# at edges and corners the forward probability is unchanged,
# and sliding off the grid leaves the agent where it is
Front = 0.7
Left = 0.2
Right = 0.1
# reward constants
Hole = -1
Goal = 5

# index semantics of each action record
Action = 0
ActionPi = 1
Reward = 2
StateProbs = 3

P = [
    [  # state 0: action, pi, reward, [state, prob]
        #[0x0000, 1, Goal, [[0, 1]]],
    ],
    [  # state 1: action, pi, reward, [state, prob]
        [0x0100, 1/3, 0, [[0, Front], [4, Left], [1, Right]]],
        [0x0102, 1/3, Hole, [[2, Front], [1, Left], [5, Right]]],
        [0x0104, 1/3, 0, [[4, Front], [2, Left], [0, Right]]]
    ],
    [  # state 2: action, pi, reward, [state, prob]
        #[0x0201, 1/3, 0, [[1, Front],[6, Left],[2, Right]]],
        #[0x0203, 1/3, 0, [[3, Front],[2, Left],[6, Right]]],
        #[0x0206, 1/3, 0, [[6, Front],[3, Left],[1, Right]]]
        #[0x0202, 1, Hole, [[2, 1]]]
    ],

    #############
    [  # state 3: action, pi, reward, [state, prob]
        [0x0300, 1/2, 0, [[0, Front], [3, Left], [4, Right]]],
        [0x0304, 1/2, 0, [[4, Front], [0, Left], [3, Right]]],
    ],
    [  # state 4: action, pi, reward, [state, prob]
        [0x0401, 1/3, 0, [[1, Front], [3, Left], [5, Right]]],
        [0x0403, 1/3, 0, [[3, Front], [4, Left], [1, Right]]],
        [0x0405, 1/3, 0, [[5, Front], [1, Left], [4, Right]]],
    ],
    [  # state 5: action, pi, reward, [state, prob]
        [0x0502, 1/2, Hole, [[2, Front], [4, Left], [5, Right]]],
        [0x0504, 1/2, 0, [[4, Front], [5, Left], [2, Right]]],
    ],

]


class DataParser(object):
    def get_next_actions(self, curr_state):
        actions_data = P[curr_state.value]
        #print(actions_data)
        return actions_data

    def get_action_pi_reward(self, action_data):
        return action_data[Action], action_data[ActionPi], action_data[Reward]

    def get_action_states_probs(self, action_data):
        return action_data[StateProbs]

    def get_next_states_probs(self, action):
        for state in P:
            for actions_data in state:
                if actions_data[Action] == action:
                    return actions_data[Reward], actions_data[StateProbs]
        return None, None


def V_pi(States, dataParser, gamma):
    num_state = 6
    V_curr = [0.0] * num_state
    V_next = [0.0] * num_state
    count = 0
    # iterate until convergence
    while True:
        # sweep over every state s
        for curr_state in States:
            v_curr_sum = 0
            # actions available from the current state, with pi(a|s)
            actions_data = dataParser.get_next_actions(curr_state)
            # loop over every action allowed by the policy
            for action_data in actions_data:
                next_action_value, next_action_prob, reward = dataParser.get_action_pi_reward(action_data)
                # transition probabilities P(s'|s,a) for this action
                next_states_probs = dataParser.get_action_states_probs(action_data)
                v_sum = 0
                # loop over every successor state
                for [next_state_value, next_state_prob] in next_states_probs:
                    # math: \sum_{s'} P_{ss'}^a v_{\pi}(s')
                    v_sum += next_state_prob * V_next[next_state_value]
                # end for
                # math: \sum_a \pi(a|s) [R_s^a + \gamma \sum_{s'} P_{ss'}^a v_{\pi}(s')]
                v_curr_sum += next_action_prob * (reward + gamma * v_sum)
            # end for
            V_curr[curr_state.value] = v_curr_sum
        # end for
        # check convergence
        if np.allclose(V_next, V_curr):
            break
        # copy V_curr into V_next for the next sweep
        V_next = V_curr.copy()
        count += 1
    # end while
    print(count)
    return V_next


def Q2_pi(Actions, dataParser, gamma, vs):
    Q = {}
    # loop over every action
    for curr_action in Actions:
        q_sum = 0
        # reward and transition probabilities P(s'|s,a) for this action
        reward, next_states_probs = dataParser.get_next_states_probs(curr_action.value)
        if reward is None:
            continue
        # sum over every successor state
        for [next_state_value, next_state_prob] in next_states_probs:
            # math: \sum_{s'} P_{ss'}^a v_{\pi}(s')
            q_sum += next_state_prob * vs[next_state_value]
        # end for
        # math: q_{\pi}(s,a) = R_s^a + \gamma \sum_{s' \in S} P_{ss'}^a v_{\pi}(s')
        q = reward + gamma * q_sum
        Q[curr_action.name] = q
    # end for
    return Q


if __name__ == "__main__":
    gamma = 0.9
    dataParser = DataParser()
    vs = V_pi(States, dataParser, gamma)
    print(np.round(np.array(vs).reshape(2, 3), 2))
    Q = Q2_pi(Actions, dataParser, gamma, vs)
    for q in Q:
        print(q, "={:.4f}".format(Q[q]))
@ -0,0 +1,142 @@
import Data_FrozenLake2 as dfl2
import numpy as np


def V_star(States, dataParser, gamma):
    num_state = len(States)
    V_curr = [0.0] * num_state
    V_next = [0.0] * num_state
    count = 0
    # iterate until convergence
    while True:
        # sweep over every state s
        for curr_state in States:
            list_v = []
            # actions available from the current state, with pi(a|s)
            next_actions_datas = dataParser.get_next_actions(curr_state.value)
            for next_action_data in next_actions_datas:
                next_action_value, next_action_prob, reward = dataParser.get_action_pi_reward(next_action_data)

                # transition probabilities P(s'|s,a) for this action
                next_states_probs = dataParser.get_action_states_probs(next_action_data)
                v_sum = 0
                # loop over every successor state
                for [next_state_value, next_state_prob] in next_states_probs:
                    # math: \sum_{s'} P_{ss'}^a v_{*}(s')
                    v_sum += next_state_prob * V_next[next_state_value]
                # end for
                # math: \max_a [R_s^a + \gamma \sum_{s'} P_{ss'}^a v_{*}(s')]
                list_v.append(reward + gamma * v_sum)
            # end for
            if len(list_v) > 0:
                V_curr[curr_state.value] = max(list_v)
        # end for
        # check convergence
        if np.allclose(V_next, V_curr):
            break
        # copy V_curr into V_next for the next sweep
        V_next = V_curr.copy()
        count += 1
    # end while
    print(count)
    return V_next


def Q_star(Actions, dataParser, gamma):
    # Q values are keyed by the hex action code, since the codes are not
    # consecutive small integers
    Q_curr = {a.value: 0.0 for a in Actions}
    Q_next = {a.value: 0.0 for a in Actions}
    count = 0
    # iterate (bounded, in case of slow convergence)
    while count < 100:
        # sweep over every action
        for curr_action in Actions:
            q_curr_sum = 0
            # reward and transition probabilities P(s'|s,a) for this action
            reward, next_states_probs = dataParser.get_next_states_probs(curr_action.value)
            if reward is None:
                continue
            # sum over every successor state
            for [next_state_value, next_state_prob] in next_states_probs:
                # actions available from the successor state
                actions_datas = dataParser.get_next_actions(next_state_value)
                list_q = []
                # take the max over next-action values
                for action_data in actions_datas:
                    action, _, _ = dataParser.get_action_pi_reward(action_data)
                    list_q.append(Q_next.get(action, 0.0))
                # end for
                # math: \sum_{s'} P_{ss'}^a \max_{a'} q_{*}(s',a')
                if len(list_q) > 0:
                    q_curr_sum += next_state_prob * max(list_q)
            # end for
            # math: R_s^a + \gamma ( \sum_{s'} P_{ss'}^a \max_{a'} q_{*}(s',a') )
            Q_curr[curr_action.value] = reward + gamma * q_curr_sum
        # end for
        # check convergence
        if np.allclose(list(Q_next.values()), list(Q_curr.values())):
            break
        # copy Q_curr into Q_next for the next sweep
        Q_next = Q_curr.copy()
        count += 1
    # end while
    print(count)
    return Q_next


def Q_star_from_V_star(Actions, dataParser, gamma, v_star):
    Q_star = {}
    # loop over every action
    for curr_action in Actions:
        q_sum = 0
        # reward and transition probabilities P(s'|s,a) for this action
        reward, next_states_probs = dataParser.get_next_states_probs(curr_action.value)
        if reward is None:
            continue
        # sum over every successor state
        for [next_state_value, next_state_prob] in next_states_probs:
            # math: \sum_{s'} P_{ss'}^a v_{*}(s')
            q_sum += next_state_prob * v_star[next_state_value]
        # end for
        # math: R_s^a + \gamma \sum_{s'} P_{ss'}^a v_{*}(s')
        Q_star[curr_action.name] = reward + gamma * q_sum
    # end for
    return sorted(Q_star.items())


def find_next_best(Q, start):
    action = None
    value = None
    for q in Q:
        if q[0].startswith(start):
            if action is None:
                action = q[0]
                value = q[1]
            else:
                if q[1] > value:
                    action = q[0]
                    value = q[1]
    return action, value


if __name__ == "__main__":
    gamma = 0.9
    dataParser = dfl2.DataParser()
    vs = V_star(dfl2.States, dataParser, gamma)
    print(np.round(np.array(vs).reshape(4, 4), 2))

    Q_star = Q_star_from_V_star(dfl2.Actions, dataParser, gamma, vs)
    for q in Q_star:
        print(q)

    # greedily follow the best action codes from the start cell
    start = "a00"
    count = 0
    while True:
        action, value = find_next_best(Q_star, start)
        print(action, value)
        if action is None:
            break
        start = "a" + action.replace(start, "")
        count += 1
        if count > 8:
            break
@ -0,0 +1,73 @@
import Data_FrozenLake2 as dfl2

import numpy as np


def V_pi(States, dataParser, gamma):
    num_state = 16
    V_curr = [0.0] * num_state
    V_next = [0.0] * num_state
    count = 0
    # iterate until convergence
    while True:
        # sweep over every state s
        for curr_state in States:
            v_curr_sum = 0
            # actions available from the current state, with pi(a|s)
            actions_data = dataParser.get_next_actions(curr_state.value)
            # loop over every action allowed by the policy
            for action_data in actions_data:
                next_action_value, next_action_prob, reward = dataParser.get_action_pi_reward(action_data)
                # transition probabilities P(s'|s,a) for this action
                next_states_probs = dataParser.get_action_states_probs(action_data)
                v_sum = 0
                # loop over every successor state
                for [next_state_value, next_state_prob] in next_states_probs:
                    # math: \sum_{s'} P_{ss'}^a v_{\pi}(s')
                    v_sum += next_state_prob * V_next[next_state_value]
                # end for
                # math: \sum_a \pi(a|s) [R_s^a + \gamma \sum_{s'} P_{ss'}^a v_{\pi}(s')]
                v_curr_sum += next_action_prob * (reward + gamma * v_sum)
            # end for
            V_curr[curr_state.value] = v_curr_sum
        # end for
        # check convergence
        if np.allclose(V_next, V_curr):
            break
        # copy V_curr into V_next for the next sweep
        V_next = V_curr.copy()
        count += 1
    # end while
    print(count)
    return V_next


def Q2_pi(Actions, dataParser, gamma, vs):
    Q = {}
    # loop over every action
    for curr_action in Actions:
        q_sum = 0
        # reward and transition probabilities P(s'|s,a) for this action
        reward, next_states_probs = dataParser.get_next_states_probs(curr_action.value)
        if reward is None:
            continue
        # sum over every successor state
        for [next_state_value, next_state_prob] in next_states_probs:
            # math: \sum_{s'} P_{ss'}^a v_{\pi}(s')
            q_sum += next_state_prob * vs[next_state_value]
        # end for
        # math: q_{\pi}(s,a) = R_s^a + \gamma \sum_{s' \in S} P_{ss'}^a v_{\pi}(s')
        q = reward + gamma * q_sum
        Q[curr_action.name] = q
    # end for
    return Q


if __name__ == "__main__":
    gamma = 0.9
    dataParser = dfl2.DataParser()
    vs = V_pi(dfl2.States, dataParser, gamma)
    print(np.round(np.array(vs).reshape(4, 4), 2))
    Q = Q2_pi(dfl2.Actions, dataParser, gamma, vs)
    for q in Q:
        print(q, "={:.4f}".format(Q[q]))
@ -0,0 +1,24 @@
import Algorithm_MDP_Star as algoMS
import Data_Students2 as ds2


def Student_V_star(gamma):
    v = algoMS.V_star(ds2.States, ds2.Pi_sa, ds2.P_as, ds2.Rewards, gamma)
    for start_state in ds2.States:
        print(start_state, "= {:.1f}".format(v[start_state.value]))


def Student_Q_star(gamma):
    v = algoMS.Q_star(ds2.Actions, ds2.Pi_sa, ds2.P_as, ds2.Rewards, gamma)
    for action in ds2.Actions:
        print(action, "= {:.1f}".format(v[action.value]))


def Student_Q_from_V_star(gamma):
    v_star = algoMS.V_star(ds2.States, ds2.Pi_sa, ds2.P_as, ds2.Rewards, gamma)
    q_star = algoMS.Q_star_from_V_star(ds2.Actions, ds2.P_as, ds2.Rewards, gamma, v_star)
    for action in ds2.Actions:
        print(action, "= {:.1f}".format(q_star[action.value]))


if __name__ == "__main__":
    gamma = 1
    Student_V_star(gamma)
    Student_Q_star(gamma)
    Student_Q_from_V_star(gamma)
@ -0,0 +1,25 @@
import Data_Students2 as ds2
import Algorithm_MDP as mba


def Student_V_Pi(gamma):
    v_pi = mba.V_pi(ds2.States, ds2.Pi_sa, ds2.P_as, ds2.Rewards, gamma)
    for state in ds2.States:
        print(state, "= {:.1f}".format(v_pi[state.value]))
    return v_pi


def Student_Q_Pi(gamma):
    q_pi = mba.Q_pi(ds2.Actions, ds2.Pi_sa, ds2.P_as, ds2.Rewards, gamma)
    for action in ds2.Actions:
        print(action, "= {:.1f}".format(q_pi[action.value]))


def Student_Q_Pi_From_V_Pi(gamma):
    v_pi = mba.V_pi(ds2.States, ds2.Pi_sa, ds2.P_as, ds2.Rewards, gamma)
    q_pi = mba.Q_pi_from_V_pi(ds2.Actions, ds2.P_as, ds2.Rewards, gamma, v_pi)
    for action in ds2.Actions:
        print(action, "= {:.1f}".format(q_pi[action.value]))


if __name__ == "__main__":
    gamma = 1
    Student_V_Pi(gamma)
    Student_Q_Pi(gamma)
    Student_Q_Pi_From_V_Pi(gamma)
@ -0,0 +1,27 @@
import numpy as np
import Algorithm_MPR as algoM
import Data_FrozenLake as dfl


def FrozenLake_MentoCarol(gamma):
    episodes = 20000
    end_states = [dfl.States.Hole2, dfl.States.Hole8, dfl.States.Hole10, dfl.States.Goal15]
    vs = algoM.MonteCarol(dfl.Rewards, dfl.Matrix, dfl.States, end_states, gamma, episodes)
    print(np.round(np.array(vs).reshape(4, 4), 2))


def FrozenLake_Matrix(gamma):
    vs = algoM.Matrix(dfl, gamma)
    print(np.round(np.array(vs).reshape(4, 4), 2))


def FrozenLake_Bellman(gamma):
    vs = algoM.Bellman(dfl.States, dfl.Matrix, dfl.Rewards, gamma)
    np.set_printoptions(suppress=True)
    print(np.round(np.array(vs).reshape(4, 4), 2))


if __name__ == "__main__":
    gamma = 1
    print(gamma)
    #FrozenLake_MentoCarol(gamma)
    FrozenLake_Matrix(gamma)
    FrozenLake_Bellman(gamma)
@ -0,0 +1,30 @@
import Data_Student as ds
import Algorithm_MPR as algoM
import numpy as np


def Student_MonteCarol(gamma):
    episodes = 10000
    end_states = [ds.States.Sleep]
    v = algoM.MonteCarol(ds.Rewards, ds.Matrix, ds.States, end_states, gamma, episodes)
    for start_state in ds.States:
        print(start_state, "= {:.2f}".format(v[start_state.value]))


def InvMatrix(gamma):
    v = algoM.Matrix(ds, gamma)
    for start_state in ds.States:
        print(start_state, "= {:.2f}".format(v[start_state.value]))
    return v


def Bellman(gamma):
    v = algoM.Bellman(ds.States, ds.Matrix, ds.Rewards, gamma)
    for start_state in ds.States:
        print(start_state, "= {:.2f}".format(v[start_state.value]))


if __name__ == "__main__":
    gamma = 0.9
    #Student_MonteCarol(gamma)
    InvMatrix(gamma)
    Bellman(gamma)