From 0c91801d6e0899e39d94297ba6a7e027c0b4f147 Mon Sep 17 00:00:00 2001 From: xiaowuhu Date: Mon, 7 Feb 2022 13:46:19 +0800 Subject: [PATCH] Xiaowu/20220123 (#712) * update * Update MRP-1.py * up * uo * uui * up * update * oo * ui * ui * Update MDP_FrozenLake_Optimal.py --- .../src/ThreeDoors.py | 0 .../{02-多臂赌博机 => 02-探索与利用}/formula.md | 0 .../imges/E-Greedy.png | Bin .../{02-多臂赌博机 => 02-探索与利用}/imges/Greedy.png | Bin .../imges/Opt-Init-1.png | Bin .../imges/Opt-Init-2.png | Bin .../imges/Opt-Init-3.png | Bin .../imges/Thompson.png | Bin .../imges/compare_1.png | Bin .../imges/compare_2.png | Bin .../{02-多臂赌博机 => 02-探索与利用}/imges/random.png | Bin .../imges/softmax-1.png | Bin .../imges/softmax-2.png | Bin .../imges/softmax-3.png | Bin .../{02-多臂赌博机 => 02-探索与利用}/imges/tps-1.png | Bin .../{02-多臂赌博机 => 02-探索与利用}/imges/tps-10.png | Bin .../{02-多臂赌博机 => 02-探索与利用}/imges/tps-2.png | Bin .../{02-多臂赌博机 => 02-探索与利用}/imges/tps-3.png | Bin .../{02-多臂赌博机 => 02-探索与利用}/imges/tps-4.png | Bin .../{02-多臂赌博机 => 02-探索与利用}/imges/tps-5.png | Bin .../{02-多臂赌博机 => 02-探索与利用}/imges/tps-6.png | Bin .../{02-多臂赌博机 => 02-探索与利用}/imges/tps-7.png | Bin .../{02-多臂赌博机 => 02-探索与利用}/imges/tps-8.png | Bin .../{02-多臂赌博机 => 02-探索与利用}/imges/tps-9.png | Bin .../{02-多臂赌博机 => 02-探索与利用}/imges/ucb.png | Bin .../{02-多臂赌博机 => 02-探索与利用}/src/__init__.py | 0 .../{02-多臂赌博机 => 02-探索与利用}/src/armit.py | 0 .../src/bandit_20_base.py | 0 .../src/bandit_21_random.py | 0 .../src/bandit_22_greedy.py | 0 .../src/bandit_23_e_greedy.py | 0 .../src/bandit_24_optimistic_initial.py | 0 .../src/bandit_25_softmax.py | 0 .../src/bandit_25_softmax_test.py | 0 .../src/bandit_26_UCB.py | 0 .../src/bandit_26_ucb_test.py | 0 .../src/bandit_27_thompson.py | 0 .../src/bandit_27_thompson_test.py | 0 .../src/bandit_28_all.py | 0 .../{02-多臂赌博机 => 02-探索与利用}/src/test.py | 0 .../A7-强化学习/03-马尔可夫决策过程/formula.md | 299 ++++++++++++++++++ .../03-马尔可夫决策过程/src/Algorithm_MDP_Pi.py | 97 ++++++ .../03-马尔可夫决策过程/src/Algorithm_MDP_Star.py | 106 +++++++ .../03-马尔可夫决策过程/src/Algorithm_MPR.py | 88 ++++++ .../03-马尔可夫决策过程/src/Data_FrozenLake.py | 115 +++++++ .../03-马尔可夫决策过程/src/Data_FrozenLake2.py | 223 +++++++++++++ .../03-马尔可夫决策过程/src/Data_Student.py | 30 ++ .../03-马尔可夫决策过程/src/Data_Students2.py | 59 ++++ .../03-马尔可夫决策过程/src/MDP_FrozenLake6Grid.py | 175 ++++++++++ .../03-马尔可夫决策过程/src/MDP_FrozenLake_Optimal.py | 142 +++++++++ .../03-马尔可夫决策过程/src/MDP_FrozenLake_ValuePI.py | 73 +++++ .../03-马尔可夫决策过程/src/MDP_Student_Optimal.py | 24 ++ .../03-马尔可夫决策过程/src/MDP_Student_ValuePI.py | 25 ++ .../03-马尔可夫决策过程/src/MPR_FrozenLake_Value.py | 27 ++ .../03-马尔可夫决策过程/src/MPR_Student_Value.py | 30 ++ .../03-马尔可夫决策过程/src/__init__.py | 0 56 files changed, 1513 insertions(+) rename 基础教程/A7-强化学习/{01-从OR到RL => 01-优化与强化学习}/src/ThreeDoors.py (100%) rename 基础教程/A7-强化学习/{02-多臂赌博机 => 02-探索与利用}/formula.md (100%) rename 基础教程/A7-强化学习/{02-多臂赌博机 => 02-探索与利用}/imges/E-Greedy.png (100%) rename 基础教程/A7-强化学习/{02-多臂赌博机 => 02-探索与利用}/imges/Greedy.png (100%) rename 基础教程/A7-强化学习/{02-多臂赌博机 => 02-探索与利用}/imges/Opt-Init-1.png (100%) rename 基础教程/A7-强化学习/{02-多臂赌博机 => 02-探索与利用}/imges/Opt-Init-2.png (100%) rename 基础教程/A7-强化学习/{02-多臂赌博机 => 02-探索与利用}/imges/Opt-Init-3.png (100%) rename 基础教程/A7-强化学习/{02-多臂赌博机 => 02-探索与利用}/imges/Thompson.png (100%) rename 基础教程/A7-强化学习/{02-多臂赌博机 => 02-探索与利用}/imges/compare_1.png (100%) rename 基础教程/A7-强化学习/{02-多臂赌博机 => 02-探索与利用}/imges/compare_2.png (100%) rename 基础教程/A7-强化学习/{02-多臂赌博机 => 02-探索与利用}/imges/random.png (100%) rename 基础教程/A7-强化学习/{02-多臂赌博机 => 02-探索与利用}/imges/softmax-1.png (100%) rename 
基础教程/A7-强化学习/{02-多臂赌博机 => 02-探索与利用}/imges/softmax-2.png (100%) rename 基础教程/A7-强化学习/{02-多臂赌博机 => 02-探索与利用}/imges/softmax-3.png (100%) rename 基础教程/A7-强化学习/{02-多臂赌博机 => 02-探索与利用}/imges/tps-1.png (100%) rename 基础教程/A7-强化学习/{02-多臂赌博机 => 02-探索与利用}/imges/tps-10.png (100%) rename 基础教程/A7-强化学习/{02-多臂赌博机 => 02-探索与利用}/imges/tps-2.png (100%) rename 基础教程/A7-强化学习/{02-多臂赌博机 => 02-探索与利用}/imges/tps-3.png (100%) rename 基础教程/A7-强化学习/{02-多臂赌博机 => 02-探索与利用}/imges/tps-4.png (100%) rename 基础教程/A7-强化学习/{02-多臂赌博机 => 02-探索与利用}/imges/tps-5.png (100%) rename 基础教程/A7-强化学习/{02-多臂赌博机 => 02-探索与利用}/imges/tps-6.png (100%) rename 基础教程/A7-强化学习/{02-多臂赌博机 => 02-探索与利用}/imges/tps-7.png (100%) rename 基础教程/A7-强化学习/{02-多臂赌博机 => 02-探索与利用}/imges/tps-8.png (100%) rename 基础教程/A7-强化学习/{02-多臂赌博机 => 02-探索与利用}/imges/tps-9.png (100%) rename 基础教程/A7-强化学习/{02-多臂赌博机 => 02-探索与利用}/imges/ucb.png (100%) rename 基础教程/A7-强化学习/{02-多臂赌博机 => 02-探索与利用}/src/__init__.py (100%) rename 基础教程/A7-强化学习/{02-多臂赌博机 => 02-探索与利用}/src/armit.py (100%) rename 基础教程/A7-强化学习/{02-多臂赌博机 => 02-探索与利用}/src/bandit_20_base.py (100%) rename 基础教程/A7-强化学习/{02-多臂赌博机 => 02-探索与利用}/src/bandit_21_random.py (100%) rename 基础教程/A7-强化学习/{02-多臂赌博机 => 02-探索与利用}/src/bandit_22_greedy.py (100%) rename 基础教程/A7-强化学习/{02-多臂赌博机 => 02-探索与利用}/src/bandit_23_e_greedy.py (100%) rename 基础教程/A7-强化学习/{02-多臂赌博机 => 02-探索与利用}/src/bandit_24_optimistic_initial.py (100%) rename 基础教程/A7-强化学习/{02-多臂赌博机 => 02-探索与利用}/src/bandit_25_softmax.py (100%) rename 基础教程/A7-强化学习/{02-多臂赌博机 => 02-探索与利用}/src/bandit_25_softmax_test.py (100%) rename 基础教程/A7-强化学习/{02-多臂赌博机 => 02-探索与利用}/src/bandit_26_UCB.py (100%) rename 基础教程/A7-强化学习/{02-多臂赌博机 => 02-探索与利用}/src/bandit_26_ucb_test.py (100%) rename 基础教程/A7-强化学习/{02-多臂赌博机 => 02-探索与利用}/src/bandit_27_thompson.py (100%) rename 基础教程/A7-强化学习/{02-多臂赌博机 => 02-探索与利用}/src/bandit_27_thompson_test.py (100%) rename 基础教程/A7-强化学习/{02-多臂赌博机 => 02-探索与利用}/src/bandit_28_all.py (100%) rename 基础教程/A7-强化学习/{02-多臂赌博机 => 02-探索与利用}/src/test.py (100%) create mode 100644 基础教程/A7-强化学习/03-马尔可夫决策过程/formula.md create mode 100644 基础教程/A7-强化学习/03-马尔可夫决策过程/src/Algorithm_MDP_Pi.py create mode 100644 基础教程/A7-强化学习/03-马尔可夫决策过程/src/Algorithm_MDP_Star.py create mode 100644 基础教程/A7-强化学习/03-马尔可夫决策过程/src/Algorithm_MPR.py create mode 100644 基础教程/A7-强化学习/03-马尔可夫决策过程/src/Data_FrozenLake.py create mode 100644 基础教程/A7-强化学习/03-马尔可夫决策过程/src/Data_FrozenLake2.py create mode 100644 基础教程/A7-强化学习/03-马尔可夫决策过程/src/Data_Student.py create mode 100644 基础教程/A7-强化学习/03-马尔可夫决策过程/src/Data_Students2.py create mode 100644 基础教程/A7-强化学习/03-马尔可夫决策过程/src/MDP_FrozenLake6Grid.py create mode 100644 基础教程/A7-强化学习/03-马尔可夫决策过程/src/MDP_FrozenLake_Optimal.py create mode 100644 基础教程/A7-强化学习/03-马尔可夫决策过程/src/MDP_FrozenLake_ValuePI.py create mode 100644 基础教程/A7-强化学习/03-马尔可夫决策过程/src/MDP_Student_Optimal.py create mode 100644 基础教程/A7-强化学习/03-马尔可夫决策过程/src/MDP_Student_ValuePI.py create mode 100644 基础教程/A7-强化学习/03-马尔可夫决策过程/src/MPR_FrozenLake_Value.py create mode 100644 基础教程/A7-强化学习/03-马尔可夫决策过程/src/MPR_Student_Value.py create mode 100644 基础教程/A7-强化学习/03-马尔可夫决策过程/src/__init__.py diff --git a/基础教程/A7-强化学习/01-从OR到RL/src/ThreeDoors.py b/基础教程/A7-强化学习/01-优化与强化学习/src/ThreeDoors.py similarity index 100% rename from 基础教程/A7-强化学习/01-从OR到RL/src/ThreeDoors.py rename to 基础教程/A7-强化学习/01-优化与强化学习/src/ThreeDoors.py diff --git a/基础教程/A7-强化学习/02-多臂赌博机/formula.md b/基础教程/A7-强化学习/02-探索与利用/formula.md similarity index 100% rename from 基础教程/A7-强化学习/02-多臂赌博机/formula.md rename to 基础教程/A7-强化学习/02-探索与利用/formula.md diff --git a/基础教程/A7-强化学习/02-多臂赌博机/imges/E-Greedy.png 
b/基础教程/A7-强化学习/02-探索与利用/imges/E-Greedy.png similarity index 100% rename from 基础教程/A7-强化学习/02-多臂赌博机/imges/E-Greedy.png rename to 基础教程/A7-强化学习/02-探索与利用/imges/E-Greedy.png diff --git a/基础教程/A7-强化学习/02-多臂赌博机/imges/Greedy.png b/基础教程/A7-强化学习/02-探索与利用/imges/Greedy.png similarity index 100% rename from 基础教程/A7-强化学习/02-多臂赌博机/imges/Greedy.png rename to 基础教程/A7-强化学习/02-探索与利用/imges/Greedy.png diff --git a/基础教程/A7-强化学习/02-多臂赌博机/imges/Opt-Init-1.png b/基础教程/A7-强化学习/02-探索与利用/imges/Opt-Init-1.png similarity index 100% rename from 基础教程/A7-强化学习/02-多臂赌博机/imges/Opt-Init-1.png rename to 基础教程/A7-强化学习/02-探索与利用/imges/Opt-Init-1.png diff --git a/基础教程/A7-强化学习/02-多臂赌博机/imges/Opt-Init-2.png b/基础教程/A7-强化学习/02-探索与利用/imges/Opt-Init-2.png similarity index 100% rename from 基础教程/A7-强化学习/02-多臂赌博机/imges/Opt-Init-2.png rename to 基础教程/A7-强化学习/02-探索与利用/imges/Opt-Init-2.png diff --git a/基础教程/A7-强化学习/02-多臂赌博机/imges/Opt-Init-3.png b/基础教程/A7-强化学习/02-探索与利用/imges/Opt-Init-3.png similarity index 100% rename from 基础教程/A7-强化学习/02-多臂赌博机/imges/Opt-Init-3.png rename to 基础教程/A7-强化学习/02-探索与利用/imges/Opt-Init-3.png diff --git a/基础教程/A7-强化学习/02-多臂赌博机/imges/Thompson.png b/基础教程/A7-强化学习/02-探索与利用/imges/Thompson.png similarity index 100% rename from 基础教程/A7-强化学习/02-多臂赌博机/imges/Thompson.png rename to 基础教程/A7-强化学习/02-探索与利用/imges/Thompson.png diff --git a/基础教程/A7-强化学习/02-多臂赌博机/imges/compare_1.png b/基础教程/A7-强化学习/02-探索与利用/imges/compare_1.png similarity index 100% rename from 基础教程/A7-强化学习/02-多臂赌博机/imges/compare_1.png rename to 基础教程/A7-强化学习/02-探索与利用/imges/compare_1.png diff --git a/基础教程/A7-强化学习/02-多臂赌博机/imges/compare_2.png b/基础教程/A7-强化学习/02-探索与利用/imges/compare_2.png similarity index 100% rename from 基础教程/A7-强化学习/02-多臂赌博机/imges/compare_2.png rename to 基础教程/A7-强化学习/02-探索与利用/imges/compare_2.png diff --git a/基础教程/A7-强化学习/02-多臂赌博机/imges/random.png b/基础教程/A7-强化学习/02-探索与利用/imges/random.png similarity index 100% rename from 基础教程/A7-强化学习/02-多臂赌博机/imges/random.png rename to 基础教程/A7-强化学习/02-探索与利用/imges/random.png diff --git a/基础教程/A7-强化学习/02-多臂赌博机/imges/softmax-1.png b/基础教程/A7-强化学习/02-探索与利用/imges/softmax-1.png similarity index 100% rename from 基础教程/A7-强化学习/02-多臂赌博机/imges/softmax-1.png rename to 基础教程/A7-强化学习/02-探索与利用/imges/softmax-1.png diff --git a/基础教程/A7-强化学习/02-多臂赌博机/imges/softmax-2.png b/基础教程/A7-强化学习/02-探索与利用/imges/softmax-2.png similarity index 100% rename from 基础教程/A7-强化学习/02-多臂赌博机/imges/softmax-2.png rename to 基础教程/A7-强化学习/02-探索与利用/imges/softmax-2.png diff --git a/基础教程/A7-强化学习/02-多臂赌博机/imges/softmax-3.png b/基础教程/A7-强化学习/02-探索与利用/imges/softmax-3.png similarity index 100% rename from 基础教程/A7-强化学习/02-多臂赌博机/imges/softmax-3.png rename to 基础教程/A7-强化学习/02-探索与利用/imges/softmax-3.png diff --git a/基础教程/A7-强化学习/02-多臂赌博机/imges/tps-1.png b/基础教程/A7-强化学习/02-探索与利用/imges/tps-1.png similarity index 100% rename from 基础教程/A7-强化学习/02-多臂赌博机/imges/tps-1.png rename to 基础教程/A7-强化学习/02-探索与利用/imges/tps-1.png diff --git a/基础教程/A7-强化学习/02-多臂赌博机/imges/tps-10.png b/基础教程/A7-强化学习/02-探索与利用/imges/tps-10.png similarity index 100% rename from 基础教程/A7-强化学习/02-多臂赌博机/imges/tps-10.png rename to 基础教程/A7-强化学习/02-探索与利用/imges/tps-10.png diff --git a/基础教程/A7-强化学习/02-多臂赌博机/imges/tps-2.png b/基础教程/A7-强化学习/02-探索与利用/imges/tps-2.png similarity index 100% rename from 基础教程/A7-强化学习/02-多臂赌博机/imges/tps-2.png rename to 基础教程/A7-强化学习/02-探索与利用/imges/tps-2.png diff --git a/基础教程/A7-强化学习/02-多臂赌博机/imges/tps-3.png b/基础教程/A7-强化学习/02-探索与利用/imges/tps-3.png similarity index 100% rename from 基础教程/A7-强化学习/02-多臂赌博机/imges/tps-3.png rename to 基础教程/A7-强化学习/02-探索与利用/imges/tps-3.png diff --git a/基础教程/A7-强化学习/02-多臂赌博机/imges/tps-4.png 
b/基础教程/A7-强化学习/02-探索与利用/imges/tps-4.png similarity index 100% rename from 基础教程/A7-强化学习/02-多臂赌博机/imges/tps-4.png rename to 基础教程/A7-强化学习/02-探索与利用/imges/tps-4.png diff --git a/基础教程/A7-强化学习/02-多臂赌博机/imges/tps-5.png b/基础教程/A7-强化学习/02-探索与利用/imges/tps-5.png similarity index 100% rename from 基础教程/A7-强化学习/02-多臂赌博机/imges/tps-5.png rename to 基础教程/A7-强化学习/02-探索与利用/imges/tps-5.png diff --git a/基础教程/A7-强化学习/02-多臂赌博机/imges/tps-6.png b/基础教程/A7-强化学习/02-探索与利用/imges/tps-6.png similarity index 100% rename from 基础教程/A7-强化学习/02-多臂赌博机/imges/tps-6.png rename to 基础教程/A7-强化学习/02-探索与利用/imges/tps-6.png diff --git a/基础教程/A7-强化学习/02-多臂赌博机/imges/tps-7.png b/基础教程/A7-强化学习/02-探索与利用/imges/tps-7.png similarity index 100% rename from 基础教程/A7-强化学习/02-多臂赌博机/imges/tps-7.png rename to 基础教程/A7-强化学习/02-探索与利用/imges/tps-7.png diff --git a/基础教程/A7-强化学习/02-多臂赌博机/imges/tps-8.png b/基础教程/A7-强化学习/02-探索与利用/imges/tps-8.png similarity index 100% rename from 基础教程/A7-强化学习/02-多臂赌博机/imges/tps-8.png rename to 基础教程/A7-强化学习/02-探索与利用/imges/tps-8.png diff --git a/基础教程/A7-强化学习/02-多臂赌博机/imges/tps-9.png b/基础教程/A7-强化学习/02-探索与利用/imges/tps-9.png similarity index 100% rename from 基础教程/A7-强化学习/02-多臂赌博机/imges/tps-9.png rename to 基础教程/A7-强化学习/02-探索与利用/imges/tps-9.png diff --git a/基础教程/A7-强化学习/02-多臂赌博机/imges/ucb.png b/基础教程/A7-强化学习/02-探索与利用/imges/ucb.png similarity index 100% rename from 基础教程/A7-强化学习/02-多臂赌博机/imges/ucb.png rename to 基础教程/A7-强化学习/02-探索与利用/imges/ucb.png diff --git a/基础教程/A7-强化学习/02-多臂赌博机/src/__init__.py b/基础教程/A7-强化学习/02-探索与利用/src/__init__.py similarity index 100% rename from 基础教程/A7-强化学习/02-多臂赌博机/src/__init__.py rename to 基础教程/A7-强化学习/02-探索与利用/src/__init__.py diff --git a/基础教程/A7-强化学习/02-多臂赌博机/src/armit.py b/基础教程/A7-强化学习/02-探索与利用/src/armit.py similarity index 100% rename from 基础教程/A7-强化学习/02-多臂赌博机/src/armit.py rename to 基础教程/A7-强化学习/02-探索与利用/src/armit.py diff --git a/基础教程/A7-强化学习/02-多臂赌博机/src/bandit_20_base.py b/基础教程/A7-强化学习/02-探索与利用/src/bandit_20_base.py similarity index 100% rename from 基础教程/A7-强化学习/02-多臂赌博机/src/bandit_20_base.py rename to 基础教程/A7-强化学习/02-探索与利用/src/bandit_20_base.py diff --git a/基础教程/A7-强化学习/02-多臂赌博机/src/bandit_21_random.py b/基础教程/A7-强化学习/02-探索与利用/src/bandit_21_random.py similarity index 100% rename from 基础教程/A7-强化学习/02-多臂赌博机/src/bandit_21_random.py rename to 基础教程/A7-强化学习/02-探索与利用/src/bandit_21_random.py diff --git a/基础教程/A7-强化学习/02-多臂赌博机/src/bandit_22_greedy.py b/基础教程/A7-强化学习/02-探索与利用/src/bandit_22_greedy.py similarity index 100% rename from 基础教程/A7-强化学习/02-多臂赌博机/src/bandit_22_greedy.py rename to 基础教程/A7-强化学习/02-探索与利用/src/bandit_22_greedy.py diff --git a/基础教程/A7-强化学习/02-多臂赌博机/src/bandit_23_e_greedy.py b/基础教程/A7-强化学习/02-探索与利用/src/bandit_23_e_greedy.py similarity index 100% rename from 基础教程/A7-强化学习/02-多臂赌博机/src/bandit_23_e_greedy.py rename to 基础教程/A7-强化学习/02-探索与利用/src/bandit_23_e_greedy.py diff --git a/基础教程/A7-强化学习/02-多臂赌博机/src/bandit_24_optimistic_initial.py b/基础教程/A7-强化学习/02-探索与利用/src/bandit_24_optimistic_initial.py similarity index 100% rename from 基础教程/A7-强化学习/02-多臂赌博机/src/bandit_24_optimistic_initial.py rename to 基础教程/A7-强化学习/02-探索与利用/src/bandit_24_optimistic_initial.py diff --git a/基础教程/A7-强化学习/02-多臂赌博机/src/bandit_25_softmax.py b/基础教程/A7-强化学习/02-探索与利用/src/bandit_25_softmax.py similarity index 100% rename from 基础教程/A7-强化学习/02-多臂赌博机/src/bandit_25_softmax.py rename to 基础教程/A7-强化学习/02-探索与利用/src/bandit_25_softmax.py diff --git a/基础教程/A7-强化学习/02-多臂赌博机/src/bandit_25_softmax_test.py b/基础教程/A7-强化学习/02-探索与利用/src/bandit_25_softmax_test.py similarity index 100% rename from 基础教程/A7-强化学习/02-多臂赌博机/src/bandit_25_softmax_test.py rename to 
基础教程/A7-强化学习/02-探索与利用/src/bandit_25_softmax_test.py diff --git a/基础教程/A7-强化学习/02-多臂赌博机/src/bandit_26_UCB.py b/基础教程/A7-强化学习/02-探索与利用/src/bandit_26_UCB.py similarity index 100% rename from 基础教程/A7-强化学习/02-多臂赌博机/src/bandit_26_UCB.py rename to 基础教程/A7-强化学习/02-探索与利用/src/bandit_26_UCB.py diff --git a/基础教程/A7-强化学习/02-多臂赌博机/src/bandit_26_ucb_test.py b/基础教程/A7-强化学习/02-探索与利用/src/bandit_26_ucb_test.py similarity index 100% rename from 基础教程/A7-强化学习/02-多臂赌博机/src/bandit_26_ucb_test.py rename to 基础教程/A7-强化学习/02-探索与利用/src/bandit_26_ucb_test.py diff --git a/基础教程/A7-强化学习/02-多臂赌博机/src/bandit_27_thompson.py b/基础教程/A7-强化学习/02-探索与利用/src/bandit_27_thompson.py similarity index 100% rename from 基础教程/A7-强化学习/02-多臂赌博机/src/bandit_27_thompson.py rename to 基础教程/A7-强化学习/02-探索与利用/src/bandit_27_thompson.py diff --git a/基础教程/A7-强化学习/02-多臂赌博机/src/bandit_27_thompson_test.py b/基础教程/A7-强化学习/02-探索与利用/src/bandit_27_thompson_test.py similarity index 100% rename from 基础教程/A7-强化学习/02-多臂赌博机/src/bandit_27_thompson_test.py rename to 基础教程/A7-强化学习/02-探索与利用/src/bandit_27_thompson_test.py diff --git a/基础教程/A7-强化学习/02-多臂赌博机/src/bandit_28_all.py b/基础教程/A7-强化学习/02-探索与利用/src/bandit_28_all.py similarity index 100% rename from 基础教程/A7-强化学习/02-多臂赌博机/src/bandit_28_all.py rename to 基础教程/A7-强化学习/02-探索与利用/src/bandit_28_all.py diff --git a/基础教程/A7-强化学习/02-多臂赌博机/src/test.py b/基础教程/A7-强化学习/02-探索与利用/src/test.py similarity index 100% rename from 基础教程/A7-强化学习/02-多臂赌博机/src/test.py rename to 基础教程/A7-强化学习/02-探索与利用/src/test.py diff --git a/基础教程/A7-强化学习/03-马尔可夫决策过程/formula.md b/基础教程/A7-强化学习/03-马尔可夫决策过程/formula.md new file mode 100644 index 00000000..4bda631d --- /dev/null +++ b/基础教程/A7-强化学习/03-马尔可夫决策过程/formula.md @@ -0,0 +1,299 @@ + +$$ +p(s'|s,a) = \Pr \{S_t=s'|S_{t-1}=s,A_{t-1}=a\} +$$ + +$$ +\sum_{i=0}^n p(s_i'|s_j,a) = 1 +$$ + +$$ +p(s_1'|s_j,a) + p(s_2'|s_j,a) + p(s_3'|s_j,a) = 1 +$$ + +$$ +P = +\begin{bmatrix} +p(s_1|s_1) & p(s_1|s_2) & \cdots & p(s_1|s_n) +\\ +p(s_2|s_1) & p(s_2|s_2) & \cdots & p(s_2|s_n) +\\ +\vdots & \vdots & \ddots & \vdots +\\ +p(s_n|s_1) & p(s_n|s_2) & \cdots & p(s_n|s_n) +\end{bmatrix} +$$ + +奖励函数 + +$$ +R(s)=\mathbb {E} \ [R_{t} \ | \ S_{t-1}=s,A_{t-1}=a ] +$$ + +$$ +R(s)=\mathbb {E} \ [R_{t} \ | \ S_{t-1}=s ] +$$ + + +$$ +\begin{aligned} +G_t &= R_{t+1} + \gamma R_{t+2} + \gamma^2 R_{t+3} + \cdots + \gamma^{T-t-1} R_{T} +\\ +&= \sum_{k=0}^{T} \gamma^k R_{t+k+1}, \ 0 \le \gamma \le 1 +\end{aligned} +$$ + +$$ +\begin{aligned} +G_t &= R_{t+1} + \gamma R_{t+2} + \gamma^2 R_{t+3} + \gamma^3 R_{t+4} + \cdots +\\ +&= R_{t+1} + \gamma (R_{t+2} + \gamma R_{t+3} + \gamma^{2} R_{t+4}+\cdots) +\\ +&=R_{t+1} + \gamma G_{t+1} +\end{aligned} +$$ + +$$ +R_s = \mathbb{E} [R_{t+1} | S_t=s] +$$ + +$$ +\begin{aligned} +V(s) &= \mathbb{E} [G_t \ | \ S_t=s] +\\ +&=\mathbb{E} [R_{t+1} + \gamma R_{t+2} + \gamma^2 R_{t+3} + \cdots \ | \ S_t=s] +\\ +&=\mathbb{E} [R_{t+1} + \gamma G_{t+1} \ | \ S_t=s] +\\ +&=\mathbb{E} [R_{t+1}] + \gamma \mathbb{E} [G_{t+1}|S_t=s] +\\ +&=R_{t+1} + \gamma V(s_{t+1}) +\end{aligned} +$$ + +$$ +V(Class3) = 4.09 +\\ +\begin{aligned} +X&=R_{Class3}+\gamma*[V(Pub)*P(S_{Class3}|S_{Pub}) +\\ +&+ V(A_{Pass})*P(S_{Class3}|S_{A_{Pass}})] +\\ +&=(-2)+0.9*(1.93*0.4+10*0.6)=4.09 +\end{aligned} +\\ +V(Class3) == X +$$ + +$$ +V(s) = R_s + \gamma * \sum V(s') P(s|s') +$$ + +$$ +V(s)=R_s + \gamma \sum_{s' \in S} Pss' \cdot V(s') +$$ + +$$ +V(s)=R_s + \gamma * [p_1V(s'_1) + p_2V(s'_2) + p_3V(s'_3)] +$$ + +矩阵形式 + +$$ +V = R + \gamma PV +$$ + +$$ +\begin{bmatrix} +V(1) +\\ +V(2) +\\ +\vdots +\\ +V(n) +\end{bmatrix} +=\ +\begin{bmatrix} +R_1 
+\\ +R_2 +\\ +\vdots +\\ +R_n +\end{bmatrix} ++\gamma +\begin{bmatrix} +P_{11} & P_{12} & \cdots & P_{1n} +\\ +P_{21} & P_{22} & \cdots & P_{2n} +\\ +\vdots & \vdots & \ddots & \vdots +\\ +P_{n1} & P_{n2} & \cdots & P_{nn} +\end{bmatrix} +\begin{bmatrix} +V(1) +\\ +V(2) +\\ +\vdots +\\ +V(n) +\end{bmatrix} +$$ + +$$ +V - \gamma PV = R +\\ +(I - \gamma P)V = R +\\ +V = (I - \gamma P)^{-1} R +$$ + +策略价值函数 + +$$ +v_{\pi}(s)=\mathbb {E}_{\pi} [ G_t |S_t=s] +$$ + +$$ +\begin{aligned} +v_{\pi}(s)&=\sum_{a \in A} \pi(a|s) q_\pi(s,a) +\\ +&=\pi(a_1|s) q_{\pi}(s,a_1)+\pi(a_2|s) q_{\pi}(s,a_2)+\pi(a_3|s) q_{\pi}(s,a_3) +\end{aligned} +$$ + +策略动作函数 + +$$ +q_{\pi}(s,a)=\mathbb E_{\pi} [G_t | S_t=s, A_t=a] +$$ + + +$$ +\begin{aligned} +q_{\pi}(s,a)&=R_s^a + \gamma \sum_{s' \in S} P^a_{ss'} v_{\pi}(s') +\\ +&= R_s^a + \gamma [P_1 v_{\pi}(s'_1)+P_2 v_{\pi}(s'_2)] +\end{aligned} +$$ + +$$ +v_{\pi}(s)=\sum_{a \in A} \pi(a|s)\Big[ R_s^a + \gamma \sum_{s' \in S} P^a_{ss'} v_{\pi}(s') \Big] +$$ + +$$ +q_{\pi}(s,a)=R_s^a + \gamma \sum_{s' \in S} P^a_{ss'} \sum_{a' \in A} \pi(a'|s') q_\pi(s',a') +$$ + +$$ +\begin{aligned} +V_1 &= \pi(A_{Play}|S_{V_1})*(R_{Play}+\gamma P_{11}V_1)+\pi(A_{Quit}|S_{V_1})*(R_{Quit}+\gamma P_{12}V_2) +\\ +V_2 &= \pi(A_{Play}|S_{V_2})*(R_{Play}+\gamma P_{21}V_1)+\pi(A_{Study1}|S_{V_2})*(R_{Study1}+\gamma P_{23}V_3) +\\ +V_3 &= \pi(A_{Sleep}|S_{V_3})*(R_{Sleep}+\gamma P_{30}V_0)+\pi(A_{Study2}|S_{V_3})*(R_{Study2}+\gamma P_{34}V_4) +\\ +V_4 &= \pi(A_{Pass}|S_{V_4})*(R_{Pass}+\gamma P_{40}V_0)+\pi(A_{Pub}|S_{V_4})*(R_{Pub}+\gamma P_{42}V_2+\gamma P_{43}V_3+\gamma P_{44}V_4) +\end{aligned} +$$ + +根据公式 + +$$ +V_*(s) = \underset{a}{\max} [R_s^a + \gamma \sum_{s' \in S} P_{ss'}^aV_*(s')] +$$ + +$$ +\gamma=1 +\\ +V_{Rest}=V0=0 +\\ +V_{Game} = V1 = \max (-1+V1, 0+V2) +\\ +V_{Class1}=V2=\max (-1+V1, -2+V3) +\\ +V_{Class2}=V3 = \max (0 + V0, -2+V4) +\\ +V_{Class3} = V4 = \max (10+V0, 1+0.2V2+0.4V3+0.4V4) +$$ + +解上述方程组 +由于 +$$ +x = \max (x+a, b) +$$ + +其中 a、b 为常数且 a<0 时,可以推论 $x=b$。所以 + +$$ +V1 = \max (-1+V1, 0+V2)=V2 +\\ +V2=\max (-1+V1, -2+V3)=\max (-1+V2, -2+V3)=V3-2=V1 +\\ +V3 = V1+2 +\\ +V3 = \max (0 + V0, -2+V4)=V4-2 +\\ +V4 = V3+2=V1+4 +\\ +V4 = \max (10+V0, 1+0.2V2+0.4V3+0.4V4) +$$ + +把所有的变量都换成V1 +$$ +V4 = \max (10, 1+0.2V1+0.4(V1+2)+0.4(V1+4))=\max (10, V1+3.4)=\max(10,V4-0.6) +$$ + +所以 +$$ +V4=10 +\\ +V1=V4-4=6 +\\ +V2=V1=6 +\\ +V3=V1+2=8 +$$ + +根据 + + +$$ +v_{\pi}(s)=\sum_{a \in A} \pi(a|s)\Big[ R_s^a + \gamma \sum_{s' \in S} P^a_{ss'} v_{\pi}(s') \Big] +$$ + +$$ +\gamma=1 +$$ + +$$ +V0=0 +\\ +V1=0.5(-1+1*1*V1)+0.5(0+1*1*V2)=0.5V1+0.5V2-0.5 +\\ +V2=0.5(-1+1*1*V1)+0.5(-2+1*1*V3)=0.5V1+0.5V3-1.5 +\\ +V3=0.5(0+1*1*0)+0.5(-2+1*1*V4)=0.5V4-1 +\\ +V4=0.5(10+1*1*0)+0.5(1+1*0.2*V2+1*0.4*V3+1*0.4*V4)=0.1V2+0.2V3+0.2V4+5.5 +$$ + +解方程组得到 + +$$ +V3=2.7 +\\ +V2=-1.3 +\\ +V1=-2.3 +\\ +V4=7.4 +$$ + +$$ +Q_*(s,a)=R_s^a + \gamma \sum_{s' \in S} P_{ss'}^aV_*(s') +$$ \ No newline at end of file diff --git a/基础教程/A7-强化学习/03-马尔可夫决策过程/src/Algorithm_MDP_Pi.py b/基础教程/A7-强化学习/03-马尔可夫决策过程/src/Algorithm_MDP_Pi.py new file mode 100644 index 00000000..ad86b097 --- /dev/null +++ b/基础教程/A7-强化学习/03-马尔可夫决策过程/src/Algorithm_MDP_Pi.py @@ -0,0 +1,97 @@ +import numpy as np + +def V_pi(States, Pi_sa, P_as, Rewards, gamma): + num_state = len(States) + V_curr = [0.0] * num_state + V_next = [0.0] * num_state + count = 0 + # 迭代 + while (True): + # 遍历所有状态 s + for curr_state in States: + v_curr_sum = 0 + # 获得 状态->动作 策略概率 + next_actions_prob = Pi_sa[curr_state.value] + # 遍历每个策略概率 + for action_value, action_prob in enumerate(next_actions_prob): + 
# 获得 动作->状态 转移概率 + next_states_prob = P_as[action_value] + v_sum = 0 + # 遍历每个转移概率 + for state_value, state_prob in enumerate(next_states_prob): + # math: \sum_{s'} P_{ss'}^a v_{\pi}(s') + v_sum += state_prob * V_next[state_value] + #end for + # math: \sum_a \pi(a|s) [R_s^a + \gamma \sum_{s'} P_{ss'}^a v_{\pi}(s')] + v_curr_sum += action_prob * (Rewards[action_value] + gamma * v_sum) + # end for + + V_curr[curr_state.value] = v_curr_sum + #endfor + # 检查收敛性 + if np.allclose(V_next, V_curr): + break + # 把 V_curr 赋值给 V_next + V_next = V_curr.copy() + count += 1 + # end while + print(count) + return V_next + + +def Q_pi(Actions, Pi_sa, P_as, Rewards, gamma): + num_action = len(Actions) + Q_curr = [0.0] * num_action + Q_next = [0.0] * num_action + count = 0 + # 迭代 + while (True): + # 遍历每个action + for curr_action in Actions: + q_curr_sum = 0 + # 获得 动作->状态 转移概率 + next_states_prob = P_as[curr_action.value] + # 遍历每个转移概率求和 + for state_value, state_prob in enumerate(next_states_prob): + # 获得 状态->动作 策略概率 + next_actions_prob = Pi_sa[state_value] + q_sum = 0 + # 遍历每个动作概率求和 + for action_value, action_prob in enumerate(next_actions_prob): + # math: \sum_{a'} \pi(a'|s')*q_{\pi}(s',a') + q_sum += action_prob * Q_next[action_value] + #end for + # math: \sum_{s'} P_{ss'}^a ( \sum_{a'} \pi(a'|s')q_{\pi}(s',a') ) + q_curr_sum += state_prob * q_sum + # end for + # math: q_{\pi}(s,a)=R_s^a + \sum_{s'} P_{ss'}^a ( \sum_{a'} \pi(a'|s')q_{\pi}(s',a') ) + Q_curr[curr_action.value] = Rewards[curr_action.value] + gamma * q_curr_sum + #endfor + # 检查收敛性 + if np.allclose(Q_next, Q_curr): + break + # 把 Q_curr 赋值给 Q_next + Q_next = Q_curr.copy() + count += 1 + # end while + print(count) + return Q_next + + +def Q_pi_from_V_pi(Actions, P_as, Rewards, gamma, v_pi): + num_action = len(Actions) + Q = [0.0] * num_action + # 遍历每个action + for curr_action in Actions: + q_sum = 0 + # 获得 动作->状态 转移概率 + next_states_probs = P_as[curr_action.value] + # 遍历每个转移概率求和 + for next_state_value, next_state_prob in enumerate(next_states_probs): + # math: \sum_{s'} P_{ss'}^a v_{\pi}(s') + q_sum += next_state_prob * v_pi[next_state_value] + # end for + # math: q_{\pi}(s,a)=R_s^a + \gamma \sum_{s' \in S} P_{ss'}^a v_{\pi}(s') + Q[curr_action.value] = Rewards[curr_action.value] + gamma * q_sum + #endfor + return Q diff --git a/基础教程/A7-强化学习/03-马尔可夫决策过程/src/Algorithm_MDP_Star.py b/基础教程/A7-强化学习/03-马尔可夫决策过程/src/Algorithm_MDP_Star.py new file mode 100644 index 00000000..8b63ffa8 --- /dev/null +++ b/基础教程/A7-强化学习/03-马尔可夫决策过程/src/Algorithm_MDP_Star.py @@ -0,0 +1,106 @@ +import numpy as np + +# state value function +def V_star(States, Pi_sa, P_as, Rewards, gamma): + num_state = len(States) + V_curr = [0.0] * num_state + V_next = [0.0] * num_state + count = 0 + # 迭代 + while (True): + # 遍历所有状态 s + for curr_state in States: + list_v = [] + # 获得 状态->动作 策略概率 + next_actions_probs = Pi_sa[curr_state.value] + # 遍历每个策略概率 + for action_value, action_prob in enumerate(next_actions_probs): + if (action_prob > 0.0): + # 获得 动作->状态 转移概率 + next_states_probs = P_as[action_value] + v_sum = 0 + # 遍历每个转移概率 + for state_value, state_prob in enumerate(next_states_probs): + # math: \sum_{s'} P_{ss'}^a v_{\pi}(s') + v_sum += state_prob * V_next[state_value] + #end for + # math: \max [R_s^a + \gamma \sum_{s'} P_{ss'}^a v_{\pi}(s')] + list_v.append(Rewards[action_value] + gamma * v_sum) + # end for + if (len(list_v) > 0): + V_curr[curr_state.value] = max(list_v) + #endfor + # 检查收敛性 + if np.allclose(V_next, V_curr): + break + # 把 V_curr 赋值给 V_next + V_next = V_curr.copy() + count += 1 + # 
end while + print(count) + return V_next + +# action value function +def Q_star(Actions, Pi_sa, P_as, Rewards, gamma): + num_action = len(Actions) + Q_curr = [0.0] * num_action + Q_next = [0.0] * num_action + count = 0 + # 迭代 + while (count < 100): + # 遍历每个action + for curr_action in Actions: + q_curr_sum = 0 + if (curr_action == Actions.Sleep): + continue + # 获得 动作->状态 转移概率 + next_states_probs = P_as[curr_action.value] + # 遍历每个转移概率求和 + for state_value, state_prob in enumerate(next_states_probs): + # 获得 状态->动作 策略概率 + next_actions_probs = Pi_sa[state_value] + list_q = [] + # 遍历每个动作概率求和 + for next_action_value, next_action_prob in enumerate(next_actions_probs): + if (next_action_prob > 0.0): + # math: q_{\pi}(s',a') + list_q.append(Q_next[next_action_value]) + #end for + # math: \sum_{s'} P_{ss'}^a \max_{a'} q_{\pi}(s',a') + if (len(list_q) > 0): + q_curr_sum += state_prob * max(list_q) + # end for + # math: R_s^a + \gamma \sum_{s'} P_{ss'}^a \max_{a'} q_{\pi}(s',a') + Q_curr[curr_action.value] = Rewards[curr_action.value] + gamma * q_curr_sum + #endfor + # 检查收敛性 + if np.allclose(Q_next, Q_curr): + break + # 把 Q_curr 赋值给 Q_next + Q_next = Q_curr.copy() + count += 1 + # end while + print(count) + return Q_next + + +# math: q_*(s,a) = R_s^a + \gamma \sum_{s' \in S} P_{ss'}^a v_*(s') +def Q_star_from_V_star(Actions, P_as, Rewards, gamma, v_star): + num_action = len(Actions) + Q = [0.0] * num_action + # 遍历每个action + for curr_action in Actions: + q_sum = 0 + if (curr_action == Actions.Sleep): + continue + # 获得 动作->状态 转移概率 + next_states_probs = P_as[curr_action.value] + # 遍历每个转移概率求和 + for next_state_value, next_state_prob in enumerate(next_states_probs): + # math: \sum_{s'} P_{ss'}^a v_{*}(s') + q_sum += next_state_prob * v_star[next_state_value] + # end for + # math: R_s^a + \gamma \sum_{s'} P_{ss'}^a v_{*}(s') + Q[curr_action.value] = Rewards[curr_action.value] + gamma * q_sum + #endfor + return Q diff --git a/基础教程/A7-强化学习/03-马尔可夫决策过程/src/Algorithm_MPR.py b/基础教程/A7-强化学习/03-马尔可夫决策过程/src/Algorithm_MPR.py new file mode 100644 index 00000000..90cba6f2 --- /dev/null +++ b/基础教程/A7-强化学习/03-马尔可夫决策过程/src/Algorithm_MPR.py @@ -0,0 +1,88 @@ +import math +import numpy as np +import tqdm +import multiprocessing as mp + +def mc_single_process( + Rewards, TransMatrix, States, + start_state, end_states, episodes, gamma): + num_state = len(Rewards) + sum_gain = 0 + for episode in tqdm.trange(episodes): + if (start_state in end_states): + # 最后一个状态也可能有reward值 + return Rewards[start_state.value] + curr_state_value = start_state.value + gain = Rewards[curr_state_value] + power = 1 + while (True): + next_state_value = np.random.choice( + num_state, p=TransMatrix[curr_state_value]) + r = Rewards[next_state_value] + gain += math.pow(gamma, power) * r + if (States(next_state_value) in end_states): + # 到达终点,分幕结束 + break + else: + power += 1 + curr_state_value = next_state_value + # end while + sum_gain += gain + # end for + v = sum_gain / episodes + return v + +# 蒙特卡洛采样法 +def MonteCarol(Rewards, TransMatrix, States, end_states, gamma, episodes): + pool = mp.Pool(processes=6) + Vs = [] + results = [] + for start_state in States: + results.append(pool.apply_async(mc_single_process, + args=(Rewards, TransMatrix, States, start_state, end_states, episodes, gamma,))) + pool.close() + pool.join() + for i in range(len(results)): + v = results[i].get() + Vs.append(v) + + return Vs + +# 矩阵法 +def Matrix(ds, gamma): + num_state = ds.Matrix.shape[0] + I = np.eye(num_state) + tmp1 = I - gamma * ds.Matrix + tmp2 = 
np.linalg.inv(tmp1) + vs = np.dot(tmp2, ds.Rewards) + + return vs + +# 贝尔曼方程迭代 +def Bellman(States, TransMatrix, Rewards, gamma): + num_states = len(Rewards) + V_curr = [0.0] * num_states + V_next = [0.0] * num_states + count = 0 + while (count < 1000): + # 遍历每一个 state 作为 start_state + for start_state in States: + # 得到转移概率 + next_states_probs = TransMatrix[start_state.value] + v_sum = 0 + # 计算下一个状态的 转移概率*状态值 的 和 v + for next_state_value, next_state_prob in enumerate(next_states_probs): + # if (prob[next_state] > 0.0): + v_sum += next_state_prob * V_next[next_state_value] + # end for + V_curr[start_state.value] = Rewards[start_state.value] + gamma * v_sum + # end for + # 检查收敛性 + if np.allclose(V_next, V_curr): + break + # 把 V_curr 赋值给 V_next + V_next = V_curr.copy() + count += 1 + # end while + print(count) + return V_next diff --git a/基础教程/A7-强化学习/03-马尔可夫决策过程/src/Data_FrozenLake.py b/基础教程/A7-强化学习/03-马尔可夫决策过程/src/Data_FrozenLake.py new file mode 100644 index 00000000..0b7b3856 --- /dev/null +++ b/基础教程/A7-强化学习/03-马尔可夫决策过程/src/Data_FrozenLake.py @@ -0,0 +1,115 @@ +import numpy as np +from enum import Enum + +# 状态 +class States(Enum): + Start = 0 + Safe1 = 1 + Hole2 = 2 + Safe3 = 3 + Safe4 = 4 + Safe5 = 5 + Safe6 = 6 + Safe7 = 7 + Hole8 = 8 + Safe9 = 9 + Hole10 = 10 + Safe11 = 11 + Safe12 = 12 + Safe13 = 13 + Safe14 = 14 + Goal15 = 15 + +# Reward +Hole = -1 +Goal = 5 + +# 状态奖励 +Rewards = [0, 0, Hole, 0, + 0, 0, 0, 0, + Hole, 0, Hole, 0, + 0, 0, 0, Goal] + +Matrix = np.array( + [ + [0.0, 1/2, 0.0, 0.0, + 1/2, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0], # 0 + + [1/3, 0.0, 1/3, 0.0, + 0.0, 1/3, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0], # 1 + + [0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0], # 2 + + [0.0, 0.0, 1/2, 0.0, + 0.0, 0.0, 0.0, 1/2, + 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0], # 3 + + [1/3, 0.0, 0.0, 0.0, + 0.0, 1/3, 0.0, 0.0, + 1/3, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0], # 4 + + [0.0, 1/4, 0.0, 0.0, + 1/4, 0.0, 1/4, 0.0, + 0.0, 1/4, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0], # 5 + + [0.0, 0.0, 1/4, 0.0, + 0.0, 1/4, 0.0, 1/4, + 0.0, 0.0, 1/4, 0.0, + 0.0, 0.0, 0.0, 0.0], # 6 + + [0.0, 0.0, 0.0, 1/3, + 0.0, 0.0, 1/3, 0.0, + 0.0, 0.0, 0.0, 1/3, + 0.0, 0.0, 0.0, 0.0], # 7 + + [0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0], # 8 + + [0.0, 0.0, 0.0, 0.0, + 0.0, 1/4, 0.0, 0.0, + 1/4, 0.0, 1/4, 0.0, + 0.0, 1/4, 0.0, 0.0], # 9 + + [0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0], # 10 + + [0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 1/3, + 0.0, 0.0, 1/3, 0.0, + 0.0, 0.0, 0.0, 1/3], # 11 + + [0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, + 1/2, 0.0, 0.0, 0.0, + 0.0, 1/2, 0.0, 0.0], # 12 + + [0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, + 0.0, 1/3, 0.0, 0.0, + 1/3, 0.0, 1/3, 0.0], # 13 + + [0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 1/3, 0.0, + 0.0, 1/3, 0.0, 1/3], # 14 + + [0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0], # 15, end state, no transform + ] +) diff --git a/基础教程/A7-强化学习/03-马尔可夫决策过程/src/Data_FrozenLake2.py b/基础教程/A7-强化学习/03-马尔可夫决策过程/src/Data_FrozenLake2.py new file mode 100644 index 00000000..483436ae --- /dev/null +++ b/基础教程/A7-强化学习/03-马尔可夫决策过程/src/Data_FrozenLake2.py @@ -0,0 +1,223 @@ +from sunau import AUDIO_FILE_ENCODING_ADPCM_G723_3 +import numpy as np +from enum import Enum + +# 状态 +class States(Enum): + Start = 0 + Safe1 = 1 + Hole2 = 2 + Safe3 = 3 + Safe4 = 4 + Safe5 = 5 + Safe6 = 6 + Safe7 = 7 + Hole8 = 8 + 
Safe9 = 9 + Hole10 = 10 + Safe11 = 11 + Safe12 = 12 + Safe13 = 13 + Safe14 = 14 + Goal15 = 15 + + +# 动作 对于4x4的方格,有正反48个动作(再减去进入终点后不能返回的数量) +class Actions(Enum): + a0001=0x0001 + a0102=0x0102 + a0203=0x0203 + a0100=0x0100 + a0201=0x0201 + a0302=0x0302 + + a0004=0x0004 + a0400=0x0400 + a0105=0x0105 + a0501 = 0x0501 + a0206=0x0206 + a0602 = 0x0602 + a0307=0x0307 + a0703 = 0x0703 + + a0405 = 0x0405 + a0506 = 0x0506 + a0607 = 0x0607 + a0504 = 0x0504 + a0605 = 0x0605 + a0706 = 0x0706 + + a0408 = 0x0408 + a0804 = 0x0804 + a0509 = 0x0509 + a0905 = 0x0905 + a0610 = 0x0610 + a1006 = 0x1006 + a0711 = 0x0711 + a1107 = 0x1107 + + a0809 = 0x0809 + a0910 = 0x0910 + a1011 = 0x1011 + a1110 = 0x1110 + a1009 = 0x1009 + a0908 = 0x0908 + + a0812 = 0x0812 + a1208 = 0x1208 + a0913 = 0x0913 + a1309 = 0x1309 + a1014 = 0x1014 + a1410 = 0x1410 + a1115 = 0x1115 + a1511 = 0x1511 + + a1213 = 0x1213 + a1314 = 0x1314 + a1415 = 0x1415 + + a1312 = 0x1312 + a1413 = 0x1413 + a1514 = 0x1514 + + +# 向前走动作F时, +# 到达前方s的概率是0.7, +# 滑到左侧的概率是0.2, +# 滑到左侧的概率是0.1, +# 如果是边角,前方概率不变,越界时呆在原地 +Front = 0.7 +Left = 0.2 +Right = 0.1 +# Reward +Hole = -1 +Goal = 5 + +# 数据在数组中的位置语义 +Action = 0 +ActionPi = 1 +Reward = 2 +StateProbs = 3 + +P=[ + [ # state 0: action, pi, reward, [state, prob] + [0x0001, 1/2, 0, [[1, Front],[0, Left],[4, Right]]], + [0x0004, 1/2, 0, [[4, Front],[1, Left],[0, Right]]] + ], + [ # state 1: action, prob, reward, [state, prob] + [0x0100, 1/3, 0, [[0, Front],[5, Left],[1, Right]]], + [0x0102, 1/3, Hole, [[2, Front],[1, Left],[5, Right]]], + [0x0105, 1/3, 0, [[5, Front],[2, Left],[0, Right]]] + ], + [ # state 2: action, prob, reward, [state, prob] + #[0x0201, 1/3, 0, [[1, Front],[6, Left],[2, Right]]], + #[0x0203, 1/3, 0, [[3, Front],[2, Left],[6, Right]]], + #[0x0206, 1/3, 0, [[6, Front],[3, Left],[1, Right]]] + [0x0202, 1, Hole, [[2, 1]]] + ], + [ # state 3: action, prob, reward, [state, prob] + [0x0302, 1/2, Hole, [[2, Front],[7, Left],[3, Right]]], + [0x0307, 1/2, 0, [[7, Front],[3, Left],[2, Right]]] + ], + ############# + [ # state 4: action, prob, reward, [state, prob] + [0x0400, 1/3, 0, [[0, Front],[4, Left],[5, Right]]], + [0x0405, 1/3, 0, [[5, Front],[0, Left],[8, Right]]], + [0x0408, 1/3, Hole, [[8, Front],[5, Left],[4, Right]]] + ], + [ # state 5: action, prob, reward, [state, prob] + [0x0501, 1/4, 0, [[1, Front],[4, Left],[6, Right]]], + [0x0504, 1/4, 0, [[4, Front],[9, Left],[1, Right]]], + [0x0506, 1/4, 0, [[6, Front],[1, Left],[9, Right]]], + [0x0509, 1/4, 0, [[9, Front],[6, Left],[4, Right]]] + ], + [ # state 6: action, prob, reward, [state, prob] + [0x0602, 1/4, Hole, [[2, Front],[5, Left],[7, Right]]], + [0x0605, 1/4, 0, [[5, Front],[10, Left],[2, Right]]], + [0x0607, 1/4, 0, [[7, Front],[2, Left],[10, Right]]], + [0x0610, 1/4, Hole, [[10, Front],[5, Left],[7, Right]]], + ], + [ # state 7: action, prob, reward, [state, prob] + [0x0703, 1/3, 0, [[3, Front],[6, Left],[7, Right]]], + [0x0706, 1/3, 0, [[6, Front],[11, Left],[3, Right]]], + [0x0711, 1/3, 0, [[11, Front],[7, Left],[6, Right]]] + ], + ################ + [ # state 8: action, prob, reward, [state, prob] + #[0x0804, 1/3, 0, [[4, Front],[8, Left],[9, Right]]], + #[0x0809, 1/3, 0, [[9, Front],[4, Left],[12, Right]]], + #[0x0812, 1/3, 0, [[12, Front],[9, Left],[8, Right]]] + [0x0808, 1, Hole, [[8, 1]]] + ], + [ # state 9: action, prob, reward, [state, prob] + [0x0905, 1/4, 0, [[5, Front],[8, Left],[10, Right]]], + [0x0908, 1/4, Hole, [[8, Front],[13, Left],[5, Right]]], + [0x0910, 1/4, Hole, [[10, Front],[5, Left],[13, Right]]], + [0x0913, 1/4, 0, 
[[13, Front],[10, Left],[8, Right]]] + ], + [ # state 10: action, prob, reward, [state, prob] + #[0x1006, 1/4, 0, [[6, Front],[9, Left],[11, Right]]], + #[0x1011, 1/4, 0, [[11, Front],[6, Left],[14, Right]]], + #[0x1014, 1/4, 0, [[14, Front],[11, Left],[9, Right]]], + #[0x1009, 1/4, 0, [[9, Front],[14, Left],[6, Right]]] + [0x1010, 1, Hole, [[10, 1]]] + ], + [ # state 11: action, prob, reward, [state, prob] + [0x1107, 1/3, 0, [[7, Front],[10, Left],[11, Right]]], + [0x1110, 1/3, Hole, [[10, Front],[15, Left],[7, Right]]], + [0x1115, 1/3, 0, [[15, Front],[15, Left],[10, Right]]] + ], + ########### + [ # state 12: action, prob, reward, [state, prob] + [0x1208, 1/2, Hole, [[8, Front],[12, Left],[13, Right]]], + [0x1213, 1/2, 0, [[13, Front],[8, Left],[12, Right]]] + ], + [ # state 13: action, prob, reward, [state, prob] + [0x1309, 1/3, 0, [[9, Front],[12, Left],[14, Right]]], + [0x1312, 1/3, 0, [[12, Front],[13, Left],[9, Right]]], + [0x1314, 1/3, 0, [[14, Front],[9, Left],[13, Right]]] + ], + [ # state 14: action, prob, reward, [state, prob] + [0x1410, 1/3, Hole, [[10, Front],[13, Left],[15, Right]]], + [0x1413, 1/3, 0, [[13, Front],[14, Left],[10, Right]]], + [0x1415, 1/3, Goal, [[15, Front],[10, Left],[14, Right]]] + #[0x1414, 1, Goal, [[14, 1]]] + ], + [ # state 15: action, prob, reward, [state, prob] + #[0x1511, 1/2, 0, [[15, Front],[14, Left], [15, Right]]], + #[0x1514, 1/2, 0, [[14, Front],[15, Left],[11, Right]]] + [0x1515, 1, Goal, [[15, 1]]] + ] + +] + +class DataParser(object): + def get_next_actions(self, curr_state): + actions_data = P[curr_state] + return actions_data + + def get_action_pi_reward(self, action_data): + return action_data[Action], action_data[ActionPi], action_data[Reward] + + def get_action_states_probs(self, action_data): + return action_data[StateProbs] + + def get_next_states_probs(self, action): + for state in P: + for actions_data in state: + if (actions_data[Action] == action): + return actions_data[Reward], actions_data[StateProbs] + return None, None + +''' +dataParser = DataParser() +data = dataParser.get_next_actions(0) +print(len(data)) +for i in range(len(data)): + a,p,r = dataParser.get_action_pi_reward(data[i]) + print(a,p,r) + sp = dataParser.get_action_states_probs(data[i]) + print(sp) +''' + + diff --git a/基础教程/A7-强化学习/03-马尔可夫决策过程/src/Data_Student.py b/基础教程/A7-强化学习/03-马尔可夫决策过程/src/Data_Student.py new file mode 100644 index 00000000..a0858158 --- /dev/null +++ b/基础教程/A7-强化学习/03-马尔可夫决策过程/src/Data_Student.py @@ -0,0 +1,30 @@ + +import numpy as np +from enum import Enum + + +# 状态 +class States(Enum): + Class1 = 0 + Class2 = 1 + Class3 = 2 + Pass = 3 + Pub = 4 + Play = 5 + Sleep = 6 + +# 收益向量 +# [Class1, Class2, Class3, Pass, Pub, Play, Sleep] +Rewards = [-2, -2, -2, 10, 1, -1, 0] + +Matrix = np.array( + [ #Cl1 Cl2 Cl3 Pas Pub Ply Slp + [0.0, 0.5, 0.0, 0.0, 0.0, 0.5, 0.0], # Class1 + [0.0, 0.0, 0.8, 0.0, 0.0, 0.0, 0.2], # CLass2 + [0.0, 0.0, 0.0, 0.6, 0.4, 0.0, 0.0], # Class3 + [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0], # Pass + [0.2, 0.4, 0.4, 0.0, 0.0, 0.0, 0.0], # Pub + [0.1, 0.0, 0.0, 0.0, 0.0, 0.9, 0.0], # Play + [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] # Sleep + ] +) diff --git a/基础教程/A7-强化学习/03-马尔可夫决策过程/src/Data_Students2.py b/基础教程/A7-强化学习/03-马尔可夫决策过程/src/Data_Students2.py new file mode 100644 index 00000000..13db8015 --- /dev/null +++ b/基础教程/A7-强化学习/03-马尔可夫决策过程/src/Data_Students2.py @@ -0,0 +1,59 @@ +from enum import Enum +import numpy as np + +# 状态 +class States(Enum): + Rest = 0 + Game = 1 + Class1 = 2 + Class2 = 3 + Class3 = 4 + +# 动作 +class 
Actions(Enum): + Quit = 0 + Play1 = 1 + Play2 = 2 + Study1 = 3 + Study2 = 4 + Pass = 5 + Pub = 6 + Sleep= 7 + +# 动作奖励 +Rewards = [0, -1, -1, -2, -2, 10, 1, 0] + +# 状态->动作概率 +Pi_sa = np.array([ + # S_Rest -> A_none + [0, 0, 0, 0, 0, 0, 0, 0], + # S_Game -> A_Quit, A_Play1 + [0.5, 0.5, 0, 0, 0, 0, 0, 0], + # S_Class1 -> A_Play2, A_Study1 + [0, 0, 0.5, 0.5, 0, 0, 0, 0], + # S_Class2 -> A_Study2, A_Sleep + [0, 0, 0, 0, 0.5, 0, 0, 0.5], + # S_Class3 -> A_Pass, A_Pub + [0, 0, 0, 0, 0, 0.5, 0.5, 0] +]) + +# 动作->状态概率 +P_as = np.array([ + # A_Quit -> S_Class1 + [0, 0, 1, 0, 0], + # A_Play1 -> S_Game + [0, 1, 0, 0, 0], + # A_Play2 -> S_Game + [0, 1, 0, 0, 0], + # A_Study1 -> S_Class2 + [0, 0, 0, 1, 0], + # A_Study2 -> S_Class3 + [0, 0, 0, 0, 1], + # A_Pass -> S_Rest + [1, 0, 0, 0, 0], + # A_Pub -> S_Class1, S_Class2, S_Class3 + [0, 0, 0.2, 0.4, 0.4], + # A_Sleep -> S_None + [0, 0, 0, 0, 0] +]) + diff --git a/基础教程/A7-强化学习/03-马尔可夫决策过程/src/MDP_FrozenLake6Grid.py b/基础教程/A7-强化学习/03-马尔可夫决策过程/src/MDP_FrozenLake6Grid.py new file mode 100644 index 00000000..1f196fb4 --- /dev/null +++ b/基础教程/A7-强化学习/03-马尔可夫决策过程/src/MDP_FrozenLake6Grid.py @@ -0,0 +1,175 @@ +import numpy as np +from enum import Enum + +# 状态 +class States(Enum): + Goal0 = 0 + Safe1 = 1 + Hole2 = 2 + Safe3 = 3 + Safe4 = 4 + Safe5 = 5 + + +# 动作 对于4x4的方格,有正反48个动作(再减去进入终点后不能返回的数量) +class Actions(Enum): + a0001=0x0001 + a0102=0x0102 + a0100=0x0100 + a0201=0x0201 + + a0004=0x0004 + a0400=0x0400 + a0105=0x0105 + a0501 = 0x0501 + a0206=0x0206 + a0602 = 0x0602 + + a0405 = 0x0405 + a0506 = 0x0506 + a0504 = 0x0504 + a0605 = 0x0605 + + +# 向前走动作F时, +# 到达前方s的概率是0.7, +# 滑到左侧的概率是0.2, +# 滑到左侧的概率是0.1, +# 如果是边角,前方概率不变,越界时呆在原地 +Front = 0.7 +Left = 0.2 +Right = 0.1 +# Reward +Hole = -1 +Goal = 5 + +# 数据在数组中的位置语义 +Action = 0 +ActionPi = 1 +Reward = 2 +StateProbs = 3 + +P=[ + [ # state 0: action, pi, reward, [state, prob] + #[0x0000, 1, Goal, [[0, 1]]], + ], + [ # state 1: action, prob, reward, [state, prob] + [0x0100, 1/3, 0, [[0, Front],[4, Left],[1, Right]]], + [0x0102, 1/3, Hole, [[2, Front],[1, Left],[5, Right]]], + [0x0104, 1/3, 0, [[4, Front],[2, Left],[0, Right]]] + ], + [ # state 2: action, prob, reward, [state, prob] + #[0x0201, 1/3, 0, [[1, Front],[6, Left],[2, Right]]], + #[0x0203, 1/3, 0, [[3, Front],[2, Left],[6, Right]]], + #[0x0206, 1/3, 0, [[6, Front],[3, Left],[1, Right]]] + #[0x0202, 1, Hole, [[2, 1]]] + ], + + ############# + [ # state 3: action, prob, reward, [state, prob] + [0x0300, 1/2, 0, [[0, Front],[3, Left],[4, Right]]], + [0x0304, 1/2, 0, [[4, Front],[0, Left],[3, Right]]], + ], + [ # state 4: action, prob, reward, [state, prob] + [0x0401, 1/3, 0, [[1, Front],[3, Left],[5, Right]]], + [0x0403, 1/3, 0, [[3, Front],[4, Left],[1, Right]]], + [0x0405, 1/3, 0, [[5, Front],[1, Left],[4, Right]]], + ], + [ # state 5: action, prob, reward, [state, prob] + [0x0502, 1/2, Hole, [[2, Front],[4, Left],[5, Right]]], + [0x0504, 1/2, 0, [[4, Front],[5, Left],[2, Right]]], + ], + +] + +class DataParser(object): + def get_next_actions(self, curr_state): + actions_data = P[curr_state.value] + #print(actions_data) + return actions_data + + def get_action_pi_reward(self, action_data): + return action_data[Action], action_data[ActionPi], action_data[Reward] + + def get_action_states_probs(self, action_data): + return action_data[StateProbs] + + def get_next_states_probs(self, action): + for state in P: + for actions_data in state: + if (actions_data[Action] == action): + return actions_data[Reward], actions_data[StateProbs] + return None, None + + + +def 
V_pi(States, dataParser, gamma): + num_state = 6 + V_curr = [0.0] * num_state + V_next = [0.0] * num_state + count = 0 + # 迭代 + while (True): + # 遍历所有状态 s + for curr_state in States: + v_curr_sum = 0 + # 获得 状态->动作 策略概率 + actions_data = dataParser.get_next_actions(curr_state) + # 遍历每个策略概率 + for action_data in actions_data: + next_action_value, next_action_prob, reward = dataParser.get_action_pi_reward(action_data) + # 获得 动作->状态 转移概率 + next_states_probs = dataParser.get_action_states_probs(action_data) + #next_states_prob = P_as[action_value] + v_sum = 0 + # 遍历每个转移概率 + for [next_state_value, next_state_prob] in next_states_probs: + # math: \sum_{s'} P_{ss'}^a v_{\pi}(s') + v_sum += next_state_prob * V_next[next_state_value] + #end for + # math: \sum_a \pi(a|s) [R_s^a + \gamma \sum_{s'} P_{ss'}^a v_{\pi}(s')] + v_curr_sum += next_action_prob * (reward + gamma * v_sum) + # end for + V_curr[curr_state.value] = v_curr_sum + #endfor + # 检查收敛性 + if np.allclose(V_next, V_curr): + break + # 把 V_curr 赋值给 V_next 迭代 + V_next = V_curr.copy() + count += 1 + # end while + print(count) + return V_next + +def Q2_pi(Actions, dataParser, gamma, vs): + Q = {} + # 遍历每个action + for curr_action in Actions: + q_sum = 0 + # 获得 动作->状态 转移概率 + reward, next_states_probs = dataParser.get_next_states_probs(curr_action.value) + if (reward is None): + continue + #next_states_probs = P_as[curr_action.value] + # 遍历每个转移概率求和 + for [next_state_value, next_state_prob] in next_states_probs: + # math: \sum_{s'} P_{ss'}^a v_{\pi}(s') + q_sum += next_state_prob * vs[next_state_value] + # end for + # math: q_{\pi}(s,a)=R_s^a + \gamma \sum_{s' \in S} P_{ss'}^a v_{\pi}(s') + q = reward + gamma * q_sum + Q[curr_action.name] = q + #endfor + return Q + + +if __name__=="__main__": + gamma = 0.9 + dataParser = DataParser() + vs = V_pi(States, dataParser, gamma) + print(np.round(np.array(vs).reshape(2,3), 2)) + Q = Q2_pi(Actions, dataParser, gamma, vs) + for q in Q: + print(q, "={:.4f}".format(Q[q])) + diff --git a/基础教程/A7-强化学习/03-马尔可夫决策过程/src/MDP_FrozenLake_Optimal.py b/基础教程/A7-强化学习/03-马尔可夫决策过程/src/MDP_FrozenLake_Optimal.py new file mode 100644 index 00000000..2ebe2df7 --- /dev/null +++ b/基础教程/A7-强化学习/03-马尔可夫决策过程/src/MDP_FrozenLake_Optimal.py @@ -0,0 +1,142 @@ +import Data_FrozenLake2 as dfl2 +import numpy as np + +def V_star(States, dataParser, gamma): + num_state = len(States) + V_curr = [0.0] * num_state + V_next = [0.0] * num_state + count = 0 + # 迭代 + while (True): + # 遍历所有状态 s + for curr_state in States: + list_v = [] + # 获得 状态->动作 策略概率 + next_actions_datas = dataParser.get_next_actions(curr_state.value) + for next_action_data in next_actions_datas: + next_action_value, next_action_prob, reward = dataParser.get_action_pi_reward(next_action_data) + + # 获得 动作->状态 转移概率 + next_states_probs = dataParser.get_action_states_probs(next_action_data) + #next_states_prob = P_as[action_value] + v_sum = 0 + # 遍历每个转移概率 + for [next_state_value, next_state_prob] in next_states_probs: + # math: \sum_{s'} P_{ss'}^a v_{\pi}(s') + v_sum += next_state_prob * V_next[next_state_value] + #end for + # math: \sum_a \pi(a|s) [R_s^a + \gamma \sum_{s'} P_{ss'}^a v_{\pi}(s')] + list_v.append(reward + gamma * v_sum) + # end for + if (len(list_v) > 0): + V_curr[curr_state.value] = max(list_v) + #endfor + # 检查收敛性 + if np.allclose(V_next, V_curr): + break + # 把 V_curr 赋值给 V_next + V_next = V_curr.copy() + count += 1 + # end while + print(count) + return V_next + +def Q_star(Actions, dataParser, gamma): + num_action = len(Actions) + Q_curr = [0.0] * num_action + Q_next 
= [0.0] * num_action + count = 0 + # 迭代 + while (count < 100): + # 遍历每个action + for curr_action in Actions: + q_curr_sum = 0 + # 获得 动作->状态 转移概率 + reward, next_states_probs = dataParser.get_next_states_probs(curr_action.value) + if (reward is None): + continue + # 遍历每个转移概率求和 + for [next_state_value, next_state_prob] in next_states_probs: + # 获得 状态->动作 策略概率 + actions_datas = dataParser.get_next_actions(next_state_value) + list_q = [] + # 求最大值 + for action_data in actions_datas: + action, _, _ = dataParser.get_action_pi_reward(action_data) + list_q.append(Q_next[action]) + #end for + # math: \sum_{a'} P_{ss'}^a \max q_{\pi}(s',a') + if (len(list_q) > 0): + q_curr_sum += next_state_prob * max(list_q) + # end for + # math: R_s^a + \gamma ( \sum_{a'} P_{ss'}^a \max q_{\pi}(s',a') ) + Q_curr[curr_action.value] = reward + gamma * q_curr_sum + #endfor + # 检查收敛性 + if np.allclose(Q_next, Q_curr): + break + # 把 Q_curr 赋值给 Q_next + Q_next = Q_curr.copy() + count += 1 + # end while + print(count) + return Q_next + + +def Q_star_from_V_star(Actions, dataParser, gamma, v_star): + Q_star = {} + # 遍历每个action + for curr_action in Actions: + q_sum = 0 + # 获得 动作->状态 转移概率 + reward, next_states_probs = dataParser.get_next_states_probs(curr_action.value) + if (reward is None): + continue + # 遍历每个转移概率求和 + for [next_state_value, next_state_prob] in next_states_probs: + # math: \sum_{a'} P_{ss'}^a v_{*}(s') + q_sum += next_state_prob * v_star[next_state_value] + # end for + # math: R_s^a + \gamma ( \sum_{a'} P_{ss'}^a \max q_{\pi}(s',a') ) + Q_star[curr_action.name] = reward + gamma * q_sum + #endfor + return sorted(Q_star.items()) + + + +def find_next_best(Q, start): + action = None + value = None + for q in Q: + if (q[0].startswith(start)): + if action is None: + action = q[0] + value = q[1] + else: + if (q[1] > value): + action = q[0] + value = q[1] + return action, value + + +if __name__=="__main__": + gamma = 0.9 + dataParser = dfl2.DataParser() + vs = V_star(dfl2.States, dataParser, gamma) + print(np.round(np.array(vs).reshape(4,4), 2)) + + Q_star = Q_star_from_V_star(dfl2.Actions, dataParser, gamma, vs) + for q in Q_star: + print(q) + + start = "a00" + count = 0 + while(True): + action, value = find_next_best(Q_star, start) + print(action, value) + if (action is None): + break + start = "a" + action.replace(start, "") + count +=1 + if (count > 8): + break \ No newline at end of file diff --git a/基础教程/A7-强化学习/03-马尔可夫决策过程/src/MDP_FrozenLake_ValuePI.py b/基础教程/A7-强化学习/03-马尔可夫决策过程/src/MDP_FrozenLake_ValuePI.py new file mode 100644 index 00000000..b7b882c5 --- /dev/null +++ b/基础教程/A7-强化学习/03-马尔可夫决策过程/src/MDP_FrozenLake_ValuePI.py @@ -0,0 +1,73 @@ +import Data_FrozenLake2 as dfl2 + +import numpy as np + +def V_pi(States, dataParser, gamma): + num_state = 16 + V_curr = [0.0] * num_state + V_next = [0.0] * num_state + count = 0 + # 迭代 + while (True): + # 遍历所有状态 s + for curr_state in States: + v_curr_sum = 0 + # 获得 状态->动作 策略概率 + actions_data = dataParser.get_next_actions(curr_state) + # 遍历每个策略概率 + for action_data in actions_data: + next_action_value, next_action_prob, reward = dataParser.get_action_pi_reward(action_data) + # 获得 动作->状态 转移概率 + next_states_probs = dataParser.get_action_states_probs(action_data) + #next_states_prob = P_as[action_value] + v_sum = 0 + # 遍历每个转移概率 + for [next_state_value, next_state_prob] in next_states_probs: + # math: \sum_{s'} P_{ss'}^a v_{\pi}(s') + v_sum += next_state_prob * V_next[next_state_value] + #end for + # math: \sum_a \pi(a|s) [R_s^a + \gamma \sum_{s'} P_{ss'}^a v_{\pi}(s')] + 
v_curr_sum += next_action_prob * (reward + gamma * v_sum) + # end for + V_curr[curr_state.value] = v_curr_sum + #endfor + # 检查收敛性 + if np.allclose(V_next, V_curr): + break + # 把 V_curr 赋值给 V_next 迭代 + V_next = V_curr.copy() + count += 1 + # end while + print(count) + return V_next + +def Q2_pi(Actions, dataParser, gamma, vs): + Q = {} + # 遍历每个action + for curr_action in Actions: + q_sum = 0 + # 获得 动作->状态 转移概率 + reward, next_states_probs = dataParser.get_next_states_probs(curr_action.value) + if (reward is None): + continue + #next_states_probs = P_as[curr_action.value] + # 遍历每个转移概率求和 + for [next_state_value, next_state_prob] in next_states_probs: + # math: \sum_{s'} P_{ss'}^a v_{\pi}(s') + q_sum += next_state_prob * vs[next_state_value] + # end for + # math: q_{\pi}(s,a)=R_s^a + \gamma \sum_{s' \in S} P_{ss'}^a v_{\pi}(s') + q = reward + gamma * q_sum + Q[curr_action.name] = q + #endfor + return Q + + +if __name__=="__main__": + gamma = 0.9 + dataParser = dfl2.DataParser() + vs = V_pi(dfl2.States, dataParser, gamma) + print(np.round(np.array(vs).reshape(4,4), 2)) + Q = Q2_pi(dfl2.Actions, dataParser, gamma, vs) + for q in Q: + print(q, "={:.4f}".format(Q[q])) diff --git a/基础教程/A7-强化学习/03-马尔可夫决策过程/src/MDP_Student_Optimal.py b/基础教程/A7-强化学习/03-马尔可夫决策过程/src/MDP_Student_Optimal.py new file mode 100644 index 00000000..646c32e6 --- /dev/null +++ b/基础教程/A7-强化学习/03-马尔可夫决策过程/src/MDP_Student_Optimal.py @@ -0,0 +1,24 @@ +import Algorithm_MDP_Star as algoMS +import Data_Students2 as ds2 + +def Student_V_star(gamma): + v = algoMS.V_star(ds2.States, ds2.Pi_sa, ds2.P_as, ds2.Rewards, gamma) + for start_state in ds2.States: + print(start_state, "= {:.1f}".format(v[start_state.value])) + +def Student_Q_star(gamma): + v = algoMS.Q_star(ds2.Actions, ds2.Pi_sa, ds2.P_as, ds2.Rewards, gamma) + for action in ds2.Actions: + print(action, "= {:.1f}".format(v[action.value])) + +def Student_Q_from_V_star(gamma): + v_star = algoMS.V_star(ds2.States, ds2.Pi_sa, ds2.P_as, ds2.Rewards, gamma) + q_star = algoMS.Q_star_from_V_star(ds2.Actions, ds2.P_as, ds2.Rewards, gamma, v_star) + for action in ds2.Actions: + print(action, "= {:.1f}".format(q_star[action.value])) + +if __name__=="__main__": + gamma = 1 + Student_V_star(gamma) + Student_Q_star(gamma) + Student_Q_from_V_star(gamma) diff --git a/基础教程/A7-强化学习/03-马尔可夫决策过程/src/MDP_Student_ValuePI.py b/基础教程/A7-强化学习/03-马尔可夫决策过程/src/MDP_Student_ValuePI.py new file mode 100644 index 00000000..0b1551ab --- /dev/null +++ b/基础教程/A7-强化学习/03-马尔可夫决策过程/src/MDP_Student_ValuePI.py @@ -0,0 +1,25 @@ +import Data_Students2 as ds2 +import Algorithm_MDP_Pi as mba + +def Student_V_Pi(gamma): + v_pi = mba.V_pi(ds2.States, ds2.Pi_sa, ds2.P_as, ds2.Rewards, gamma) + for state in ds2.States: + print(state, "= {:.1f}".format(v_pi[state.value])) + return v_pi + +def Student_Q_Pi(gamma): + q_pi = mba.Q_pi(ds2.Actions, ds2.Pi_sa, ds2.P_as, ds2.Rewards, gamma) + for action in ds2.Actions: + print(action, "= {:.1f}".format(q_pi[action.value])) + +def Student_Q_Pi_From_V_Pi(gamma): + v_pi = mba.V_pi(ds2.States, ds2.Pi_sa, ds2.P_as, ds2.Rewards, gamma) + q_pi = mba.Q_pi_from_V_pi(ds2.Actions, ds2.P_as, ds2.Rewards, gamma, v_pi) + for action in ds2.Actions: + print(action, "= {:.1f}".format(q_pi[action.value])) + +if __name__=="__main__": + gamma = 1 + Student_V_Pi(gamma) + Student_Q_Pi(gamma) + Student_Q_Pi_From_V_Pi(gamma) diff --git a/基础教程/A7-强化学习/03-马尔可夫决策过程/src/MPR_FrozenLake_Value.py b/基础教程/A7-强化学习/03-马尔可夫决策过程/src/MPR_FrozenLake_Value.py new file mode 100644 index 00000000..d49b81d4 --- /dev/null +++ 
b/基础教程/A7-强化学习/03-马尔可夫决策过程/src/MPR_FrozenLake_Value.py @@ -0,0 +1,27 @@ +import numpy as np +import Algorithm_MPR as algoM +import Data_FrozenLake as dfl + + +def FrozenLake_MentoCarol(gamma): + episodes = 20000 + end_states = [dfl.States.Hole2, dfl.States.Hole8, dfl.States.Hole10, dfl.States.Goal15] + vs = algoM.MonteCarol(dfl.Rewards, dfl.Matrix, dfl.States, end_states, gamma, episodes) + print(np.round(np.array(vs).reshape(4,4), 2)) + +def FrozenLake_Matrix(gamma): + vs = algoM.Matrix(dfl, gamma) + print(np.round(np.array(vs).reshape(4,4), 2)) + +def FrozenLake_Bellman(gamma): + vs = algoM.Bellman(dfl.States, dfl.Matrix, dfl.Rewards, gamma) + np.set_printoptions(suppress=True) + print(np.round(np.array(vs).reshape(4,4), 2)) + + +if __name__=="__main__": + gamma = 1 + print(gamma) + #FrozenLake_MentoCarol(gamma) + FrozenLake_Matrix(gamma) + FrozenLake_Bellman(gamma) diff --git a/基础教程/A7-强化学习/03-马尔可夫决策过程/src/MPR_Student_Value.py b/基础教程/A7-强化学习/03-马尔可夫决策过程/src/MPR_Student_Value.py new file mode 100644 index 00000000..5e4f1a3e --- /dev/null +++ b/基础教程/A7-强化学习/03-马尔可夫决策过程/src/MPR_Student_Value.py @@ -0,0 +1,30 @@ +import Data_Student as ds +import Algorithm_MPR as algoM +import numpy as np + + +def Student_MonteCarol(gamma): + episodes = 10000 + end_states = [ds.States.Sleep] + v = algoM.MonteCarol(ds.Rewards, ds.Matrix, ds.States, end_states, gamma, episodes) + for start_state in ds.States: + print(start_state, "= {:.2f}".format(v[start_state.value])) + + +def InvMatrix(gamma): + v = algoM.Matrix(ds, gamma) + for start_state in ds.States: + print(start_state, "= {:.2f}".format(v[start_state.value])) + return v + +def Bellman(gamma): + v = algoM.Bellman(ds.States, ds.Matrix, ds.Rewards, gamma) + for start_state in ds.States: + print(start_state, "= {:.2f}".format(v[start_state.value])) + + +if __name__=="__main__": + gamma = 0.9 + #Student_MonteCarol(gamma) + InvMatrix(gamma) + Bellman(gamma) diff --git a/基础教程/A7-强化学习/03-马尔可夫决策过程/src/__init__.py b/基础教程/A7-强化学习/03-马尔可夫决策过程/src/__init__.py new file mode 100644 index 00000000..e69de29b
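A minimal sketch of the closed-form MRP solution V = (I - gamma*P)^{-1} R derived in formula.md, with the reward vector and transition matrix copied from the new Data_Student.py and gamma = 0.9 as used in the text; only the Class3 value (about 4.09) is taken from the document, the script itself is an illustrative assumption rather than part of the patch's code.

import numpy as np

# Reward vector and transition matrix copied from Data_Student.py
# (state order: Class1, Class2, Class3, Pass, Pub, Play, Sleep)
R = np.array([-2, -2, -2, 10, 1, -1, 0], dtype=float)
P = np.array([
    [0.0, 0.5, 0.0, 0.0, 0.0, 0.5, 0.0],   # Class1
    [0.0, 0.0, 0.8, 0.0, 0.0, 0.0, 0.2],   # Class2
    [0.0, 0.0, 0.0, 0.6, 0.4, 0.0, 0.0],   # Class3
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],   # Pass
    [0.2, 0.4, 0.4, 0.0, 0.0, 0.0, 0.0],   # Pub
    [0.1, 0.0, 0.0, 0.0, 0.0, 0.9, 0.0],   # Play
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],   # Sleep (terminal)
])
gamma = 0.9
# V = (I - gamma * P)^{-1} R, the matrix form derived in formula.md
V = np.linalg.inv(np.eye(7) - gamma * P) @ R
print(np.round(V, 2))   # index 2 (Class3) should land near the 4.09 quoted in formula.md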
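A minimal check of the policy-evaluation system for V1..V4 (gamma = 1) written out in formula.md; the matrix below is just those four equations rearranged into Ax = b form, nothing beyond what the document states.

import numpy as np

# Rearranged from formula.md:
#   0.5*V1 - 0.5*V2                   = -0.5
#  -0.5*V1 +     V2 - 0.5*V3          = -1.5
#                     V3 - 0.5*V4     = -1.0
#          -0.1*V2 - 0.2*V3 + 0.8*V4  =  5.5
A = np.array([
    [ 0.5, -0.5,  0.0,  0.0],
    [-0.5,  1.0, -0.5,  0.0],
    [ 0.0,  0.0,  1.0, -0.5],
    [ 0.0, -0.1, -0.2,  0.8],
])
b = np.array([-0.5, -1.5, -1.0, 5.5])
print(np.round(np.linalg.solve(A, b), 1))   # expect roughly [-2.3, -1.3, 2.7, 7.4], matching formula.md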
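A minimal value-iteration sketch for the Bellman optimality equation V_*(s) = max_a [R_s^a + gamma * sum_s' P_{ss'}^a V_*(s')] on the student MDP, reproducing the hand-derived V1=6, V2=6, V3=8, V4=10 from formula.md; the rewards and transitions mirror Data_Students2.py (state indices 0..4 = Rest, Game, Class1, Class2, Class3), but the flat dictionary encoding of the actions is an assumption for illustration only.

# Value iteration sketch for the student MDP (gamma = 1).
# Each state maps to its available actions as (reward, [(next_state, prob), ...]).
actions = {
    1: [(-1, [(1, 1.0)]),                      # Play: stay in Game
        ( 0, [(2, 1.0)])],                     # Quit -> Class1
    2: [(-1, [(1, 1.0)]),                      # Play -> Game
        (-2, [(3, 1.0)])],                     # Study1 -> Class2
    3: [( 0, [(0, 1.0)]),                      # Sleep -> Rest
        (-2, [(4, 1.0)])],                     # Study2 -> Class3
    4: [(10, [(0, 1.0)]),                      # Pass -> Rest
        ( 1, [(2, 0.2), (3, 0.4), (4, 0.4)])], # Pub -> Class1/Class2/Class3
}
gamma = 1.0
V = [0.0] * 5                                  # Rest (state 0) is terminal and stays 0
while True:
    V_new = V.copy()
    for s, acts in actions.items():
        V_new[s] = max(r + gamma * sum(p * V[s2] for s2, p in probs) for r, probs in acts)
    if all(abs(x - y) < 1e-6 for x, y in zip(V, V_new)):
        break
    V = V_new
print(V)   # expect [0, 6, 6, 8, 10], i.e. V1=6, V2=6, V3=8, V4=10 as derived in formula.md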