You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

767 lines
31 KiB

<!DOCTYPE HTML>
<html lang="zh-Hans">
<head>
<meta charset="UTF-8">
<meta content="text/html; charset=utf-8" http-equiv="Content-Type">
<title>使用Policy Gradient玩乒乓球游戏 · GitBook</title>
<meta http-equiv="X-UA-Compatible" content="IE=edge" />
<meta name="description" content="">
<meta name="generator" content="GitBook 3.2.3">
<link rel="stylesheet" href="../gitbook/style.css">
<link rel="stylesheet" href="../gitbook/gitbook-plugin-katex/katex.min.css">
<link rel="stylesheet" href="../gitbook/gitbook-plugin-highlight/website.css">
<link rel="stylesheet" href="../gitbook/gitbook-plugin-search/search.css">
<link rel="stylesheet" href="../gitbook/gitbook-plugin-fontsettings/website.css">
<meta name="HandheldFriendly" content="true"/>
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="apple-mobile-web-app-capable" content="yes">
<meta name="apple-mobile-web-app-status-bar-style" content="black">
<link rel="apple-touch-icon-precomposed" sizes="152x152" href="../gitbook/images/apple-touch-icon-precomposed-152.png">
<link rel="shortcut icon" href="../gitbook/images/favicon.ico" type="image/x-icon">
<link rel="next" href="../recommand.html" />
<link rel="prev" href="Policy Gradient.html" />
</head>
<body>
<div class="book">
<div class="book-summary">
<div id="book-search-input" role="search">
<input type="text" placeholder="Type to search" aria-label="Type to search" />
</div>
<nav role="navigation">
<ul class="summary">
<li class="chapter " data-level="1.1" data-path="../">
<a href="../">
简介
</a>
</li>
<li class="chapter " data-level="1.2" data-path="../machine_learning.html">
<a href="../machine_learning.html">
机器学习概述
</a>
</li>
<li class="chapter " data-level="1.3" data-path="../algorithm.html">
<a href="../algorithm.html">
常见机器学习算法
</a>
<ul class="articles">
<li class="chapter " data-level="1.3.1" data-path="../kNN.html">
<a href="../kNN.html">
近朱者赤近墨者黑-kNN
</a>
</li>
<li class="chapter " data-level="1.3.2" data-path="../linear_regression.html">
<a href="../linear_regression.html">
最简单的回归算法-线性回归
</a>
</li>
<li class="chapter " data-level="1.3.3" data-path="../logistic_regression.html">
<a href="../logistic_regression.html">
使用回归的思想进行分类-逻辑回归
</a>
</li>
<li class="chapter " data-level="1.3.4" data-path="../decision_tree.html">
<a href="../decision_tree.html">
最接近人类思维的算法-决策树
</a>
</li>
<li class="chapter " data-level="1.3.5" data-path="../random_forest.html">
<a href="../random_forest.html">
群众的力量是伟大的-随机森林
</a>
</li>
<li class="chapter " data-level="1.3.6" data-path="../kMeans.html">
<a href="../kMeans.html">
物以类聚人以群分-kMeans
</a>
</li>
<li class="chapter " data-level="1.3.7" data-path="../AGNES.html">
<a href="../AGNES.html">
以距离为尺-AGNES
</a>
</li>
</ul>
</li>
<li class="chapter " data-level="1.4" data-path="../metrics.html">
<a href="../metrics.html">
模型评估指标
</a>
<ul class="articles">
<li class="chapter " data-level="1.4.1" data-path="../classification_metrics.html">
<a href="../classification_metrics.html">
分类性能评估指标
</a>
</li>
<li class="chapter " data-level="1.4.2" data-path="../regression_metrics.html">
<a href="../regression_metrics.html">
回归性能评估指标
</a>
</li>
<li class="chapter " data-level="1.4.3" data-path="../cluster_metrics.html">
<a href="../cluster_metrics.html">
聚类性能评估指标
</a>
</li>
</ul>
</li>
<li class="chapter " data-level="1.5" data-path="../sklearn.html">
<a href="../sklearn.html">
使用sklearn进行机器学习
</a>
</li>
<li class="chapter " data-level="1.6" >
<span>
综合实战案例
</span>
<ul class="articles">
<li class="chapter " data-level="1.6.1" >
<span>
泰坦尼克生还预测
</span>
<ul class="articles">
<li class="chapter " data-level="1.6.1.1" data-path="../titanic/introduction.html">
<a href="../titanic/introduction.html">
简介
</a>
</li>
<li class="chapter " data-level="1.6.1.2" data-path="../titanic/EDA.html">
<a href="../titanic/EDA.html">
探索性数据分析(EDA)
</a>
</li>
<li class="chapter " data-level="1.6.1.3" data-path="../titanic/feature engerning.html">
<a href="../titanic/feature engerning.html">
特征工程
</a>
</li>
<li class="chapter " data-level="1.6.1.4" data-path="../titanic/fit and predict.html">
<a href="../titanic/fit and predict.html">
构建模型进行预测
</a>
</li>
<li class="chapter " data-level="1.6.1.5" data-path="../titanic/tuning.html">
<a href="../titanic/tuning.html">
调参
</a>
</li>
</ul>
</li>
<li class="chapter " data-level="1.6.2" >
<span>
使用强化学习玩乒乓球游戏
</span>
<ul class="articles">
<li class="chapter " data-level="1.6.2.1" data-path="what is reinforce learning.html">
<a href="what is reinforce learning.html">
什么是强化学习
</a>
</li>
<li class="chapter " data-level="1.6.2.2" data-path="Policy Gradient.html">
<a href="Policy Gradient.html">
Policy Gradient原理
</a>
</li>
<li class="chapter active" data-level="1.6.2.3" data-path="coding.html">
<a href="coding.html">
使用Policy Gradient玩乒乓球游戏
</a>
</li>
</ul>
</li>
</ul>
</li>
<li class="chapter " data-level="1.7" data-path="../recommand.html">
<a href="../recommand.html">
实训推荐
</a>
</li>
<li class="divider"></li>
<li>
<a href="https://www.gitbook.com" target="_blank" rel="noopener noreferrer" class="gitbook-link">
Published with GitBook
</a>
</li>
</ul>
</nav>
</div>
<div class="book-body">
<div class="body-inner">
<div class="book-header" role="navigation">
<!-- Title -->
<h1>
<i class="fa fa-circle-o-notch fa-spin"></i>
<a href=".." >使用Policy Gradient玩乒乓球游戏</a>
</h1>
</div>
<div class="page-wrapper" tabindex="-1" role="main">
<div class="page-inner">
<div id="book-search-results">
<div class="search-noresults">
<section class="normal markdown-section">
<h1 id="&#x4F7F;&#x7528;policy-gradient&#x73A9;&#x4E52;&#x4E53;&#x7403;&#x6E38;&#x620F;">&#x4F7F;&#x7528;Policy Gradient&#x73A9;&#x4E52;&#x4E53;&#x7403;&#x6E38;&#x620F;</h1>
<h2 id="&#x5B89;&#x88C5;-gym">&#x5B89;&#x88C5; gym</h2>
<p>&#x60F3;&#x8981;&#x73A9;&#x4E52;&#x4E53;&#x7403;&#x6E38;&#x620F;&#xFF0C;&#x9996;&#x5148;&#x5F97;&#x6709;&#x4E52;&#x4E53;&#x7403;&#x6E38;&#x620F;&#x3002;OpenAI &#x7684; gym &#x4E3A;&#x6211;&#x4EEC;&#x63D0;&#x4F9B;&#x4E86;&#x6A21;&#x62DF;&#x6E38;&#x620F;&#x7684;&#x73AF;&#x5883;&#x3002;&#x4F7F;&#x5F97;&#x6211;&#x4EEC;&#x80FD;&#x591F;&#x5F88;&#x65B9;&#x4FBF;&#x5730;&#x5F97;&#x5230;&#x6E38;&#x620F;&#x7684;&#x73AF;&#x5883;&#x72B6;&#x6001;&#xFF0C;&#x5E76;&#x4F5C;&#x51FA;&#x52A8;&#x4F5C;&#x3002;&#x60F3;&#x8981;&#x5B89;&#x88C5; gym &#x975E;&#x5E38;&#x7B80;&#x5355;&#xFF0C;&#x53EA;&#x8981;&#x5728;&#x547D;&#x4EE4;&#x884C;&#x4E2D;&#x8F93;&#x5165;<code>pip install gym</code>&#x5373;&#x53EF;&#x3002;</p>
<h2 id="&#x5B89;&#x88C5;-ataripy">&#x5B89;&#x88C5; atari_py</h2>
<p>&#x7531;&#x4E8E;&#x4E52;&#x4E53;&#x7403;&#x6E38;&#x620F;&#x662F;&#x96C5;&#x8FBE;&#x5229;&#x6E38;&#x620F;&#x673A;&#x4E0A;&#x7684;&#x6E38;&#x620F;&#xFF0C;&#x6240;&#x4EE5;&#x9700;&#x8981;&#x5B89;&#x88C5; atari_py &#x6765;&#x5B9E;&#x73B0;&#x96C5;&#x8FBE;&#x5229;&#x73AF;&#x5883;&#x7684;&#x6A21;&#x62DF;&#x3002;&#x5B89;&#x88C5; atari_py &#x4E5F;&#x5F88;&#x65B9;&#x4FBF;&#xFF0C;&#x53EA;&#x9700;&#x5728;&#x547D;&#x4EE4;&#x884C;&#x4E2D;&#x8F93;&#x5165;<code>pip install --no-index -f https://github.com/Kojoley/atari-py/releases atari_py</code> &#x5373;&#x53EF;&#x3002;</p>
<h2 id="&#x5F00;&#x542F;&#x6E38;&#x620F;">&#x5F00;&#x542F;&#x6E38;&#x620F;</h2>
<p>&#x5F53;&#x5B89;&#x88C5;&#x597D;&#x6240;&#x9700;&#x8981;&#x7684;&#x5E93;&#x4E4B;&#x540E;&#xFF0C;&#x6211;&#x4EEC;&#x53EF;&#x4EE5;&#x4F7F;&#x7528;&#x5982;&#x4E0B;&#x4EE3;&#x7801;&#x5F00;&#x59CB;&#x6E38;&#x620F;&#xFF1A;</p>
<pre><code class="lang-python"><span class="hljs-comment"># &#x5F00;&#x542F;&#x4E52;&#x4E53;&#x7403;&#x6E38;&#x620F;&#x73AF;&#x5883;</span>
<span class="hljs-keyword">import</span> gym
env = gym.make(<span class="hljs-string">&apos;Pong-v0&apos;</span>)
<span class="hljs-comment"># &#x4E00;&#x76F4;&#x6E32;&#x67D3;&#x6E38;&#x620F;&#x753B;&#x9762;</span>
<span class="hljs-keyword">while</span> <span class="hljs-keyword">True</span>:
env.render()
<span class="hljs-comment"># &#x968F;&#x673A;&#x505A;&#x52A8;&#x4F5C;&#xFF0C;&#x5E76;&#x5F97;&#x5230;&#x505A;&#x5B8C;&#x52A8;&#x4F5C;&#x4E4B;&#x540E;&#x7684;&#x73AF;&#x5883;(observation)&#xFF0C;&#x53CD;&#x9988;(reward)&#xFF0C;&#x662F;&#x5426;&#x7ED3;&#x675F;(done)</span>
observation, reward, done, _ = env.step(env.action_space.sample())
</code></pre>
<h2 id="&#x6E38;&#x620F;&#x753B;&#x9762;&#x9884;&#x5904;&#x7406;">&#x6E38;&#x620F;&#x753B;&#x9762;&#x9884;&#x5904;&#x7406;</h2>
<p>&#x7531;&#x4E8E;<code>env.step</code>&#x8FD4;&#x56DE;&#x51FA;&#x6765;&#x7684; observation &#x662F;&#x4E00;&#x5F20;RGB&#x7684;&#x4E09;&#x901A;&#x9053;&#x56FE;&#xFF0C;&#x800C;&#x4E14;&#x6211;&#x4EEC;&#x7684;&#x6321;&#x677F;&#x600E;&#x4E48;&#x79FB;&#x52A8;&#x53EA;&#x8DDF;&#x6321;&#x677F;&#x548C;&#x7403;&#x6709;&#x5173;&#x7CFB;&#xFF0C;&#x6240;&#x4EE5;&#x6211;&#x4EEC;&#x53EF;&#x4EE5;&#x5C1D;&#x8BD5;&#x5C06;&#x4E09;&#x901A;&#x9053;&#x56FE;&#x8F6C;&#x6362;&#x6210;&#x4E00;&#x5F20;&#x4E8C;&#x503C;&#x5316;&#x7684;&#x56FE;&#xFF0C;&#x5176;&#x4E2D;&#x6321;&#x677F;&#x548C;&#x7403;&#x662F; 1 &#xFF0C;&#x80CC;&#x666F;&#x662F; 0 &#x3002;</p>
<pre><code class="lang-python">
<span class="hljs-comment"># &#x6E38;&#x620F;&#x753B;&#x9762;&#x9884;&#x5904;&#x7406;</span>
<span class="hljs-function"><span class="hljs-keyword">def</span> <span class="hljs-title">prepro</span><span class="hljs-params">(I)</span>:</span>
I = I[<span class="hljs-number">35</span>:<span class="hljs-number">195</span>] <span class="hljs-comment">#&#x4E0D;&#x8981;&#x4E0A;&#x9762;&#x7684;&#x8BB0;&#x5206;&#x724C;</span>
I = I[::<span class="hljs-number">2</span>, ::<span class="hljs-number">2</span>, <span class="hljs-number">0</span>] <span class="hljs-comment">#scale 0.5&#xFF0C;&#x6240;&#x4EE5;I&#x662F;&#x9AD8;&#x4E3A;80&#xFF0C;&#x5BBD;&#x4E3A;80&#x7684;&#x5355;&#x901A;&#x9053;&#x56FE;</span>
I[I == <span class="hljs-number">144</span>] = <span class="hljs-number">0</span> <span class="hljs-comment"># &#x80CC;&#x666F;&#x8D4B;&#x503C;&#x4E3A;0</span>
I[I == <span class="hljs-number">109</span>] = <span class="hljs-number">0</span> <span class="hljs-comment"># &#x80CC;&#x666F;&#x8D4B;&#x503C;&#x4E3A;0</span>
I[I != <span class="hljs-number">0</span>] = <span class="hljs-number">1</span> <span class="hljs-comment"># &#x76EE;&#x6807;&#x4E3A;1</span>
<span class="hljs-keyword">return</span> I.astype(np.float64).ravel() <span class="hljs-comment">#&#x5C06;&#x4E8C;&#x7EF4;&#x56FE;&#x538B;&#x6210;&#x4E00;&#x7EF4;&#x7684;&#x6570;&#x7EC4;</span>
<span class="hljs-comment"># cur_x&#x4E3A;&#x9884;&#x5904;&#x7406;&#x540E;&#x7684;&#x6E38;&#x620F;&#x753B;&#x9762;</span>
cur_x = prepro(observation)
</code></pre>
<p>&#x6E38;&#x620F;&#x7684;&#x753B;&#x9762;&#x662F;&#x9010;&#x5E27;&#x7EC4;&#x6210;&#x7684;&#xFF0C;&#x5982;&#x679C;&#x6211;&#x4EEC;&#x5C06;&#x5F53;&#x524D;&#x5E27;&#x548C;&#x4E0A;&#x4E00;&#x5E27;&#x7684;&#x56FE;&#x50CF;&#x76F8;&#x51CF;&#x5C31;&#x80FD;&#x5F97;&#x5230;&#x80FD;&#x591F;&#x8868;&#x793A;&#x4E24;&#x5E27;&#x4E4B;&#x95F4;&#x7684;&#x53D8;&#x5316;&#x7684;&#x5E27;&#x5DEE;&#x56FE;&#xFF0C;&#x5C06;&#x8FD9;&#x6837;&#x7684;&#x5E27;&#x5DEE;&#x56FE;&#x4F5C;&#x4E3A;&#x795E;&#x7ECF;&#x7F51;&#x7EDC;&#x7684;&#x8F93;&#x5165;&#x7684;&#x8BDD;&#x4F1A;&#x662F;&#x4E2A;&#x4E0D;&#x9519;&#x7684;&#x9009;&#x62E9;&#x3002;</p>
<pre><code class="lang-python"><span class="hljs-comment"># x&#x4E3A;&#x5E27;&#x5DEE;&#x56FE;</span>
x = cur_x - prev_x
<span class="hljs-comment"># &#x5C06;&#x5F53;&#x524D;&#x5E27;&#x66F4;&#x65B0;&#x4E3A;&#x4E0A;&#x4E00;&#x5E27;</span>
prev_x = cur_x
</code></pre>
<h2 id="&#x642D;&#x5EFA;&#x795E;&#x7ECF;&#x7F51;&#x7EDC;">&#x642D;&#x5EFA;&#x795E;&#x7ECF;&#x7F51;&#x7EDC;</h2>
<p>&#x795E;&#x7ECF;&#x7F51;&#x7EDC;&#x53EF;&#x4EE5;&#x6839;&#x636E;&#x81EA;&#x5DF1;&#x7684;&#x559C;&#x597D;&#x6765;&#x642D;&#x5EFA;&#xFF0C;&#x5728;&#x8FD9;&#x91CC;&#x6211;&#x4F7F;&#x7528;&#x6700;&#x7B80;&#x5355;&#x7684;&#x53EA;&#x6709;&#x4E24;&#x5C42;&#x5168;&#x8FDE;&#x63A5;&#x5C42;&#x7684;&#x7F51;&#x7EDC;&#x6A21;&#x578B;&#x6765;&#x8FDB;&#x884C;&#x9884;&#x6D4B;&#xFF0C;&#x7531;&#x4E8E;&#x6211;&#x4EEC;&#x6321;&#x677F;&#x7684;&#x52A8;&#x4F5C;&#x53EA;&#x6709;&#x4E0A;&#x548C;&#x4E0B;&#xFF0C;&#x6240;&#x4EE5;&#x6700;&#x540E;&#x7684;&#x6FC0;&#x6D3B;&#x51FD;&#x6570;&#x4E3A; sigmoid &#x51FD;&#x6570;&#x3002;</p>
<pre><code class="lang-python"><span class="hljs-comment"># &#x795E;&#x7ECF;&#x7F51;&#x7EDC;&#x4E2D;&#x795E;&#x7ECF;&#x5143;&#x7684;&#x53C2;&#x6570;</span>
model = {}
<span class="hljs-comment"># &#x968F;&#x673A;&#x521D;&#x59CB;&#x5316;&#x7B2C;&#x4E00;&#x5C42;&#x7684;&#x795E;&#x7ECF;&#x5143;&#x53C2;&#x6570;&#xFF0C;&#x603B;&#x5171;200&#x4E2A;&#x795E;&#x7ECF;&#x5143;</span>
model[<span class="hljs-string">&apos;W1&apos;</span>] = np.random.randn(H, D) / np.sqrt(D)
<span class="hljs-comment"># &#x968F;&#x673A;&#x521D;&#x59CB;&#x5316;&#x7B2C;&#x4E8C;&#x5C42;(&#x8F93;&#x51FA;&#x5C42;)&#x7684;&#x53C2;&#x6570;&#xFF0C;&#x8BE5;&#x5C42;&#x53EA;&#x6709;1&#x4E2A;&#x795E;&#x7ECF;&#x5143;&#xFF0C;&#x5171;200&#x4E2A;&#x6743;&#x91CD;</span>
model[<span class="hljs-string">&apos;W2&apos;</span>] = np.random.randn(H) / np.sqrt(H)
<span class="hljs-function"><span class="hljs-keyword">def</span> <span class="hljs-title">sigmoid</span><span class="hljs-params">(x)</span>:</span>
<span class="hljs-keyword">return</span> <span class="hljs-number">1.0</span> / (<span class="hljs-number">1.0</span> + np.exp(-x))
<span class="hljs-comment"># &#x795E;&#x7ECF;&#x7F51;&#x7EDC;&#x7684;&#x524D;&#x5411;&#x4F20;&#x64AD;&#xFF0C;x&#x4E3A;&#x8F93;&#x5165;&#x7684;&#x5E27;&#x5DEE;&#x56FE;</span>
<span class="hljs-function"><span class="hljs-keyword">def</span> <span class="hljs-title">policy_forward</span><span class="hljs-params">(x)</span>:</span>
h = np.dot(model[<span class="hljs-string">&apos;W1&apos;</span>], x)
<span class="hljs-comment"># relu</span>
h[h &lt; <span class="hljs-number">0</span>] = <span class="hljs-number">0</span>
logp = np.dot(model[<span class="hljs-string">&apos;W2&apos;</span>], h)
<span class="hljs-comment"># sigmoid&#x6FC0;&#x6D3B;</span>
p = sigmoid(logp)
<span class="hljs-comment"># p&#x4E3A;&#x4E0B;&#x4E00;&#x6B65;&#x8981;&#x5F80;&#x4E0A;&#x632A;&#x7684;&#x6982;&#x7387;&#xFF0C;h&#x4E3A;&#x9690;&#x85CF;&#x5C42;&#x7684;&#x8F93;&#x51FA;</span>
<span class="hljs-keyword">return</span> p, h
<span class="hljs-comment"># &#x7B97;&#x6BCF;&#x5C42;&#x7684;&#x53C2;&#x6570;&#x504F;&#x5BFC;&#xFF0C;eph&#x4E3A;&#x4E00;&#x4E2A;&#x6E38;&#x620F;&#x5E8F;&#x5217;&#x7684;&#x9690;&#x85CF;&#x5C42;&#x8F93;&#x51FA;&#xFF0C;epdlogp&#x4E3A;&#x4E00;&#x4E2A;&#x6E38;&#x620F;&#x5E8F;&#x5217;&#x4E2D;&#x53CD;&#x9988;&#x671F;&#x671B;&#x7684;&#x504F;&#x5BFC;&#x3002;</span>
<span class="hljs-function"><span class="hljs-keyword">def</span> <span class="hljs-title">policy_backward</span><span class="hljs-params">(eph, epdlogp)</span>:</span>
dW2 = np.dot(eph.T, epdlogp).ravel()
dh = np.outer(epdlogp, model[<span class="hljs-string">&apos;W2&apos;</span>])
dh[eph &lt;= <span class="hljs-number">0</span>] = <span class="hljs-number">0</span>
dW1 = np.dot(dh.T, epx)
<span class="hljs-keyword">return</span> {<span class="hljs-string">&apos;W1&apos;</span>: dW1, <span class="hljs-string">&apos;W2&apos;</span>: dW2}
</code></pre>
<h2 id="&#x8BAD;&#x7EC3;&#x795E;&#x7ECF;&#x7F51;&#x7EDC;">&#x8BAD;&#x7EC3;&#x795E;&#x7ECF;&#x7F51;&#x7EDC;</h2>
<pre><code class="lang-python"><span class="hljs-keyword">while</span> <span class="hljs-keyword">True</span>:
env.render()
<span class="hljs-comment"># &#x6E38;&#x620F;&#x753B;&#x9762;&#x9884;&#x5904;&#x7406;</span>
cur_x = prepro(observation)
<span class="hljs-comment"># &#x5F97;&#x5230;&#x5E27;&#x5DEE;&#x56FE;</span>
x = cur_x - prev_x <span class="hljs-keyword">if</span> prev_x <span class="hljs-keyword">is</span> <span class="hljs-keyword">not</span> <span class="hljs-keyword">None</span> <span class="hljs-keyword">else</span> np.zeros(D)
<span class="hljs-comment"># &#x5C06;&#x4E0A;&#x4E00;&#x5E27;&#x66F4;&#x65B0;&#x4E3A;&#x5F53;&#x524D;&#x5E27;</span>
prev_x = cur_x
<span class="hljs-comment">#&#x524D;&#x5411;&#x4F20;&#x64AD;</span>
aprob, h = policy_forward(x)
<span class="hljs-comment">#&#x4ECE;&#x52A8;&#x4F5C;&#x6982;&#x7387;&#x5206;&#x5E03;&#x4E2D;&#x91C7;&#x6837;&#xFF0C;action=2&#x8868;&#x793A;&#x5F80;&#x4E0A;&#x632A;&#xFF0C;action=3&#x8868;&#x793A;&#x5F80;&#x4E0B;&#x632A;</span>
action = <span class="hljs-number">2</span> <span class="hljs-keyword">if</span> np.random.uniform() &lt; aprob <span class="hljs-keyword">else</span> <span class="hljs-number">3</span>
<span class="hljs-comment"># &#x73AF;&#x5883;</span>
xs.append(x)
<span class="hljs-comment"># &#x9690;&#x85CF;&#x5C42;&#x72B6;&#x6001;</span>
hs.append(h)
<span class="hljs-comment"># &#x5C06;2&#x548C;3&#x6539;&#x6210;1&#x548C;0&#xFF0C;&#x56E0;&#x4E3A;sigmoid&#x51FD;&#x6570;&#x7684;&#x5BFC;&#x6570;&#x4E3A;f(x)*(1-f(x))</span>
y = <span class="hljs-number">1</span> <span class="hljs-keyword">if</span> action == <span class="hljs-number">2</span> <span class="hljs-keyword">else</span> <span class="hljs-number">0</span>
dlogps.append(y - aprob)
<span class="hljs-comment"># &#x628A;&#x91C7;&#x6837;&#x5230;&#x7684;&#x52A8;&#x4F5C;&#x4F20;&#x56DE;&#x73AF;&#x5883;</span>
observation, reward, done, info = env.step(action)
<span class="hljs-comment"># &#x5982;&#x679C;&#x5F97;&#x4E00;&#x5206;&#x5219;reward&#x4E3A;1&#xFF0C;&#x4E22;&#x4E00;&#x5206;&#x5219;reward&#x4E3A;-1</span>
reward_sum += reward
<span class="hljs-comment"># &#x8BB0;&#x5F55;&#x53CD;&#x9988;</span>
drs.append(reward)
<span class="hljs-comment"># &#x5F53;&#x6709;&#x4E00;&#x65B9;&#x5F97;&#x5230;21&#x5206;&#x540E;&#x6E38;&#x620F;&#x7ED3;&#x675F;</span>
<span class="hljs-keyword">if</span> done:
episode_number += <span class="hljs-number">1</span>
epx = np.vstack(xs)
eph = np.vstack(hs)
epdlogp = np.vstack(dlogps)
epr = np.vstack(drs)
discounted_epr = discount_rewards(epr)
<span class="hljs-comment"># &#x5C06;&#x53CD;&#x9988;&#x8FDB;&#x884C;zscore&#x5F52;&#x4E00;&#x5316;&#xFF0C;&#x6709;&#x5229;&#x4E8E;&#x8BAD;&#x7EC3;</span>
discounted_epr -= np.mean(discounted_epr)
discounted_epr /= np.std(discounted_epr)
<span class="hljs-comment">#&#x7B97;&#x671F;&#x671B;</span>
epdlogp *= discounted_epr
<span class="hljs-comment">#&#x7B97;&#x68AF;&#x5EA6;</span>
grad = policy_backward(eph, epdlogp)
<span class="hljs-keyword">for</span> k <span class="hljs-keyword">in</span> model:
grad_buffer[k] += grad[k]
<span class="hljs-comment"># &#x6BCF;batch_size&#x6B21;&#x6E38;&#x620F;&#x66F4;&#x65B0;&#x4E00;&#x6B21;&#x53C2;&#x6570;</span>
<span class="hljs-keyword">if</span> episode_number % batch_size == <span class="hljs-number">0</span>:
<span class="hljs-comment">#rmsprop&#x68AF;&#x5EA6;&#x4E0A;&#x5347;</span>
<span class="hljs-keyword">for</span> k, v <span class="hljs-keyword">in</span> model.items():
g = grad_buffer[k]
rmsprop_cache[k] = decay_rate * rmsprop_cache[k] + (<span class="hljs-number">1</span> - decay_rate) * g ** <span class="hljs-number">2</span>
model[k] += learning_rate * g / (np.sqrt(rmsprop_cache[k]) + <span class="hljs-number">1e-5</span>)
grad_buffer[k] = np.zeros_like(v)
<span class="hljs-comment"># &#x6BCF;100&#x628A;&#x4E4B;&#x540E;&#x4FDD;&#x5B58;&#x6A21;&#x578B;</span>
<span class="hljs-keyword">if</span> episode_number % <span class="hljs-number">100</span> == <span class="hljs-number">0</span>:
pickle.dump(model, open(<span class="hljs-string">&apos;save.p&apos;</span>, <span class="hljs-string">&apos;wb&apos;</span>))
reward_sum = <span class="hljs-number">0</span>
<span class="hljs-comment"># &#x91CD;&#x7F6E;&#x6E38;&#x620F;</span>
observation = env.reset()
prev_x = <span class="hljs-keyword">None</span>
</code></pre>
<h2 id="&#x52A0;&#x8F7D;&#x6A21;&#x578B;&#x73A9;&#x6E38;&#x620F;">&#x52A0;&#x8F7D;&#x6A21;&#x578B;&#x73A9;&#x6E38;&#x620F;</h2>
<p>&#x7ECF;&#x8FC7;&#x6F2B;&#x957F;&#x7684;&#x8BAD;&#x7EC3;&#x8FC7;&#x7A0B;&#x540E;&#xFF0C;&#x6211;&#x4EEC;&#x53EF;&#x4EE5;&#x5C06;&#x8BAD;&#x7EC3;&#x597D;&#x7684;&#x6A21;&#x578B;&#x52A0;&#x8F7D;&#x8FDB;&#x6765;&#x5F00;&#x59CB;&#x73A9;&#x6E38;&#x620F;&#x4E86;&#x3002;</p>
<pre><code class="lang-python"><span class="hljs-keyword">import</span> numpy <span class="hljs-keyword">as</span> np
<span class="hljs-keyword">import</span> pickle
<span class="hljs-keyword">import</span> gym
model = pickle.load(open(<span class="hljs-string">&apos;save.p&apos;</span>, <span class="hljs-string">&apos;rb&apos;</span>))
env = gym.make(<span class="hljs-string">&quot;Pong-v0&quot;</span>)
observation = env.reset()
<span class="hljs-keyword">while</span> <span class="hljs-keyword">True</span>:
env.render()
cur_x = prepro(observation)
x = cur_x - prev_x <span class="hljs-keyword">if</span> prev_x <span class="hljs-keyword">is</span> <span class="hljs-keyword">not</span> <span class="hljs-keyword">None</span> <span class="hljs-keyword">else</span> np.zeros(<span class="hljs-number">80</span>*<span class="hljs-number">80</span>)
prev_x = cur_x
aprob, h = policy_forward(x)
<span class="hljs-comment">#&#x4ECE;&#x52A8;&#x4F5C;&#x6982;&#x7387;&#x5206;&#x5E03;&#x4E2D;&#x91C7;&#x6837;</span>
action = <span class="hljs-number">2</span> <span class="hljs-keyword">if</span> np.random.uniform() &lt; aprob <span class="hljs-keyword">else</span> <span class="hljs-number">3</span>
observation, reward, done, info = env.step(action)
<span class="hljs-keyword">if</span> done:
observation = env.reset()
prev_x = <span class="hljs-keyword">None</span>
</code></pre>
</section>
</div>
<div class="search-results">
<div class="has-results">
<h1 class="search-results-title"><span class='search-results-count'></span> results matching "<span class='search-query'></span>"</h1>
<ul class="search-results-list"></ul>
</div>
<div class="no-results">
<h1 class="search-results-title">No results matching "<span class='search-query'></span>"</h1>
</div>
</div>
</div>
</div>
</div>
</div>
<a href="Policy Gradient.html" class="navigation navigation-prev " aria-label="Previous page: Policy Gradient原理">
<i class="fa fa-angle-left"></i>
</a>
<a href="../recommand.html" class="navigation navigation-next " aria-label="Next page: 实训推荐">
<i class="fa fa-angle-right"></i>
</a>
</div>
<script>
// Reuse the global `gitbook` value if an earlier script already defined it;
// otherwise start with an empty array that callbacks can be queued on.
var gitbook = gitbook || [];
// Queue a callback that reports this page's state to the GitBook runtime.
// NOTE(review): presumably gitbook.js (loaded after this inline script) runs the
// queued callbacks once it has initialized; confirm against the GitBook runtime.
gitbook.push(function() {
// The argument is the generated state for this page: page metadata (title, level,
// next/previous chapters), the book config (plugins, pluginsConfig, structure, pdf,
// styles), the source file info, the GitBook version and build time, the base path
// and the book language. It is generated output, so keep it byte-identical.
gitbook.page.hasChanged({"page":{"title":"使用Policy Gradient玩乒乓球游戏","level":"1.6.2.3","depth":3,"next":{"title":"实训推荐","level":"1.7","depth":1,"path":"recommand.md","ref":"recommand.md","articles":[]},"previous":{"title":"Policy Gradient原理","level":"1.6.2.2","depth":3,"path":"pingpong/Policy Gradient.md","ref":"./pingpong/Policy Gradient.md","articles":[]},"dir":"ltr"},"config":{"gitbook":"*","theme":"default","variables":{},"plugins":["katex"],"pluginsConfig":{"katex":{},"highlight":{},"search":{},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"fontsettings":{"theme":"white","family":"sans","size":2},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"showLevel":false}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"}},"file":{"path":"pingpong/coding.md","mtime":"2019-07-05T01:30:47.528Z","type":"markdown"},"gitbook":{"version":"3.2.3","time":"2019-07-06T07:31:21.537Z"},"basePath":"..","book":{"language":""}});
});
</script>
</div>
<script src="../gitbook/gitbook.js"></script>
<script src="../gitbook/theme.js"></script>
<script src="../gitbook/gitbook-plugin-search/search-engine.js"></script>
<script src="../gitbook/gitbook-plugin-search/search.js"></script>
<script src="../gitbook/gitbook-plugin-lunr/lunr.min.js"></script>
<script src="../gitbook/gitbook-plugin-lunr/search-lunr.js"></script>
<script src="../gitbook/gitbook-plugin-sharing/buttons.js"></script>
<script src="../gitbook/gitbook-plugin-fontsettings/fontsettings.js"></script>
</body>
</html>