You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
682 lines
98 KiB
682 lines
98 KiB
5 years ago
|
|
||
|
<!DOCTYPE HTML>
|
||
|
<html lang="" >
|
||
|
<head>
|
||
|
<meta charset="UTF-8">
|
||
|
<meta content="text/html; charset=utf-8" http-equiv="Content-Type">
|
||
|
<title>Policy Gradient原理 · GitBook</title>
|
||
|
<meta http-equiv="X-UA-Compatible" content="IE=edge" />
|
||
|
<meta name="description" content="">
|
||
|
<meta name="generator" content="GitBook 3.2.3">
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
<link rel="stylesheet" href="../gitbook/style.css">
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
<link rel="stylesheet" href="../gitbook/gitbook-plugin-katex/katex.min.css">
|
||
|
|
||
|
|
||
|
|
||
|
<link rel="stylesheet" href="../gitbook/gitbook-plugin-highlight/website.css">
|
||
|
|
||
|
|
||
|
|
||
|
<link rel="stylesheet" href="../gitbook/gitbook-plugin-search/search.css">
|
||
|
|
||
|
|
||
|
|
||
|
<link rel="stylesheet" href="../gitbook/gitbook-plugin-fontsettings/website.css">
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
<meta name="HandheldFriendly" content="true"/>
|
||
|
<meta name="viewport" content="width=device-width, initial-scale=1, user-scalable=no">
|
||
|
<meta name="apple-mobile-web-app-capable" content="yes">
|
||
|
<meta name="apple-mobile-web-app-status-bar-style" content="black">
|
||
|
<link rel="apple-touch-icon-precomposed" sizes="152x152" href="../gitbook/images/apple-touch-icon-precomposed-152.png">
|
||
|
<link rel="shortcut icon" href="../gitbook/images/favicon.ico" type="image/x-icon">
|
||
|
|
||
|
|
||
|
<link rel="next" href="coding.html" />
|
||
|
|
||
|
|
||
|
<link rel="prev" href="what is reinforce learning.html" />
|
||
|
|
||
|
|
||
|
</head>
|
||
|
<body>
|
||
|
|
||
|
<div class="book">
|
||
|
<div class="book-summary">
|
||
|
|
||
|
|
||
|
<div id="book-search-input" role="search">
|
||
|
<input type="text" placeholder="Type to search" />
|
||
|
</div>
|
||
|
|
||
|
|
||
|
<nav role="navigation">
|
||
|
|
||
|
|
||
|
|
||
|
<ul class="summary">
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
<li class="chapter " data-level="1.1" data-path="../">
|
||
|
|
||
|
<a href="../">
|
||
|
|
||
|
|
||
|
简介
|
||
|
|
||
|
</a>
|
||
|
|
||
|
|
||
|
|
||
|
</li>
|
||
|
|
||
5 years ago
|
<li class="chapter " data-level="1.2" data-path="../machine_learning.html">
|
||
|
|
||
|
<a href="../machine_learning.html">
|
||
|
|
||
|
|
||
|
机器学习概述
|
||
|
|
||
|
</a>
|
||
|
|
||
|
|
||
|
|
||
|
</li>
|
||
|
|
||
|
<li class="chapter " data-level="1.3" data-path="../algorithm.html">
|
||
|
|
||
|
<a href="../algorithm.html">
|
||
|
|
||
|
|
||
|
常见机器学习算法
|
||
|
|
||
|
</a>
|
||
|
|
||
|
|
||
|
|
||
|
<ul class="articles">
|
||
|
|
||
|
|
||
|
<li class="chapter " data-level="1.3.1" data-path="../kNN.html">
|
||
|
|
||
|
<a href="../kNN.html">
|
||
|
|
||
|
|
||
|
近朱者赤近墨者黑-kNN
|
||
|
|
||
|
</a>
|
||
|
|
||
|
|
||
|
|
||
|
</li>
|
||
|
|
||
|
<li class="chapter " data-level="1.3.2" data-path="../linear_regression.html">
|
||
|
|
||
|
<a href="../linear_regression.html">
|
||
|
|
||
|
|
||
|
最简单的回归算法-线性回归
|
||
|
|
||
|
</a>
|
||
|
|
||
|
|
||
|
|
||
|
</li>
|
||
|
|
||
|
<li class="chapter " data-level="1.3.3" data-path="../logistic_regression.html">
|
||
|
|
||
|
<a href="../logistic_regression.html">
|
||
|
|
||
|
|
||
|
使用回归的思想进行分类-逻辑回归
|
||
|
|
||
|
</a>
|
||
|
|
||
|
|
||
|
|
||
|
</li>
|
||
|
|
||
|
<li class="chapter " data-level="1.3.4" data-path="../decision_tree.html">
|
||
|
|
||
|
<a href="../decision_tree.html">
|
||
|
|
||
|
|
||
|
最接近人类思维的算法-决策树
|
||
|
|
||
|
</a>
|
||
|
|
||
|
|
||
|
|
||
|
</li>
|
||
|
|
||
|
<li class="chapter " data-level="1.3.5" data-path="../random_forest.html">
|
||
|
|
||
|
<a href="../random_forest.html">
|
||
|
|
||
|
|
||
|
群众的力量是伟大的-随机森林
|
||
|
|
||
|
</a>
|
||
|
|
||
|
|
||
|
|
||
|
</li>
|
||
|
|
||
|
<li class="chapter " data-level="1.3.6" data-path="../kMeans.html">
|
||
|
|
||
|
<a href="../kMeans.html">
|
||
|
|
||
|
|
||
|
物以类聚人以群分-kMeans
|
||
|
|
||
|
</a>
|
||
|
|
||
|
|
||
|
|
||
|
</li>
|
||
|
|
||
|
<li class="chapter " data-level="1.3.7" data-path="../AGNES.html">
|
||
|
|
||
|
<a href="../AGNES.html">
|
||
|
|
||
|
|
||
|
以距离为尺-AGNES
|
||
|
|
||
|
</a>
|
||
|
|
||
|
|
||
|
|
||
|
</li>
|
||
|
|
||
|
|
||
|
</ul>
|
||
|
|
||
|
</li>
|
||
|
|
||
|
<li class="chapter " data-level="1.4" data-path="../metrics.html">
|
||
|
|
||
|
<a href="../metrics.html">
|
||
|
|
||
|
|
||
|
模型评估指标
|
||
|
|
||
|
</a>
|
||
|
|
||
|
|
||
|
|
||
|
<ul class="articles">
|
||
|
|
||
|
|
||
|
<li class="chapter " data-level="1.4.1" data-path="../classification_metrics.html">
|
||
|
|
||
|
<a href="../classification_metrics.html">
|
||
|
|
||
|
|
||
|
分类性能评估指标
|
||
|
|
||
|
</a>
|
||
|
|
||
|
|
||
|
|
||
|
</li>
|
||
|
|
||
|
<li class="chapter " data-level="1.4.2" data-path="../regression_metrics.html">
|
||
|
|
||
|
<a href="../regression_metrics.html">
|
||
|
|
||
|
|
||
|
回归性能评估指标
|
||
|
|
||
|
</a>
|
||
|
|
||
|
|
||
|
|
||
|
</li>
|
||
|
|
||
|
<li class="chapter " data-level="1.4.3" data-path="../cluster_metrics.html">
|
||
|
|
||
|
<a href="../cluster_metrics.html">
|
||
|
|
||
|
|
||
|
聚类性能评估指标
|
||
|
|
||
|
</a>
|
||
|
|
||
|
|
||
|
|
||
|
</li>
|
||
|
|
||
|
|
||
|
</ul>
|
||
|
|
||
|
</li>
|
||
|
|
||
|
<li class="chapter " data-level="1.5" data-path="../sklearn.html">
|
||
|
|
||
|
<a href="../sklearn.html">
|
||
|
|
||
|
|
||
|
使用sklearn进行机器学习
|
||
|
|
||
|
</a>
|
||
|
|
||
|
|
||
|
|
||
|
</li>
|
||
|
|
||
|
<li class="chapter " data-level="1.6" >
|
||
|
|
||
|
<span>
|
||
|
|
||
|
|
||
|
综合实战案例
|
||
|
|
||
|
</span>
|
||
|
|
||
|
|
||
|
|
||
|
<ul class="articles">
|
||
|
|
||
|
|
||
|
<li class="chapter " data-level="1.6.1" >
|
||
5 years ago
|
|
||
|
<span>
|
||
|
|
||
|
|
||
|
泰坦尼克生还预测
|
||
|
|
||
|
</span>
|
||
|
|
||
|
|
||
|
|
||
|
<ul class="articles">
|
||
|
|
||
|
|
||
5 years ago
|
<li class="chapter " data-level="1.6.1.1" data-path="../titanic/introduction.html">
|
||
5 years ago
|
|
||
|
<a href="../titanic/introduction.html">
|
||
|
|
||
|
|
||
|
简介
|
||
|
|
||
|
</a>
|
||
|
|
||
|
|
||
|
|
||
|
</li>
|
||
|
|
||
5 years ago
|
<li class="chapter " data-level="1.6.1.2" data-path="../titanic/EDA.html">
|
||
5 years ago
|
|
||
|
<a href="../titanic/EDA.html">
|
||
|
|
||
|
|
||
|
探索性数据分析(EDA)
|
||
|
|
||
|
</a>
|
||
|
|
||
|
|
||
|
|
||
|
</li>
|
||
|
|
||
5 years ago
|
<li class="chapter " data-level="1.6.1.3" data-path="../titanic/feature engerning.html">
|
||
5 years ago
|
|
||
|
<a href="../titanic/feature engerning.html">
|
||
|
|
||
|
|
||
|
特征工程
|
||
|
|
||
|
</a>
|
||
|
|
||
|
|
||
|
|
||
|
</li>
|
||
|
|
||
5 years ago
|
<li class="chapter " data-level="1.6.1.4" data-path="../titanic/fit and predict.html">
|
||
5 years ago
|
|
||
|
<a href="../titanic/fit and predict.html">
|
||
|
|
||
|
|
||
|
构建模型进行预测
|
||
|
|
||
|
</a>
|
||
|
|
||
|
|
||
|
|
||
|
</li>
|
||
|
|
||
5 years ago
|
<li class="chapter " data-level="1.6.1.5" data-path="../titanic/tuning.html">
|
||
5 years ago
|
|
||
|
<a href="../titanic/tuning.html">
|
||
|
|
||
|
|
||
|
调参
|
||
|
|
||
|
</a>
|
||
|
|
||
|
|
||
|
|
||
|
</li>
|
||
|
|
||
|
|
||
|
</ul>
|
||
|
|
||
|
</li>
|
||
|
|
||
5 years ago
|
<li class="chapter " data-level="1.6.2" >
|
||
5 years ago
|
|
||
|
<span>
|
||
|
|
||
|
|
||
|
使用强化学习玩乒乓球游戏
|
||
|
|
||
|
</span>
|
||
|
|
||
|
|
||
|
|
||
|
<ul class="articles">
|
||
|
|
||
|
|
||
5 years ago
|
<li class="chapter " data-level="1.6.2.1" data-path="what is reinforce learning.html">
|
||
5 years ago
|
|
||
|
<a href="what is reinforce learning.html">
|
||
|
|
||
|
|
||
|
什么是强化学习
|
||
|
|
||
|
</a>
|
||
|
|
||
|
|
||
|
|
||
|
</li>
|
||
|
|
||
5 years ago
|
<li class="chapter active" data-level="1.6.2.2" data-path="Policy Gradient.html">
|
||
5 years ago
|
|
||
|
<a href="Policy Gradient.html">
|
||
|
|
||
|
|
||
|
Policy Gradient原理
|
||
|
|
||
|
</a>
|
||
|
|
||
|
|
||
|
|
||
|
</li>
|
||
|
|
||
5 years ago
|
<li class="chapter " data-level="1.6.2.3" data-path="coding.html">
|
||
5 years ago
|
|
||
|
<a href="coding.html">
|
||
|
|
||
|
|
||
|
使用Policy Gradient玩乒乓球游戏
|
||
|
|
||
|
</a>
|
||
|
|
||
|
|
||
|
|
||
|
</li>
|
||
|
|
||
|
|
||
|
</ul>
|
||
|
|
||
|
</li>
|
||
|
|
||
|
|
||
5 years ago
|
</ul>
|
||
|
|
||
|
</li>
|
||
|
|
||
|
<li class="chapter " data-level="1.7" data-path="../recommand.html">
|
||
|
|
||
|
<a href="../recommand.html">
|
||
|
|
||
|
|
||
|
实训推荐
|
||
|
|
||
|
</a>
|
||
|
|
||
|
|
||
|
|
||
|
</li>
|
||
|
|
||
|
|
||
5 years ago
|
|
||
|
|
||
|
<li class="divider"></li>
|
||
|
|
||
|
<li>
|
||
|
<a href="https://www.gitbook.com" target="blank" class="gitbook-link">
|
||
|
Published with GitBook
|
||
|
</a>
|
||
|
</li>
|
||
|
</ul>
|
||
|
|
||
|
|
||
|
</nav>
|
||
|
|
||
|
|
||
|
</div>
|
||
|
|
||
|
<div class="book-body">
|
||
|
|
||
|
<div class="body-inner">
|
||
|
|
||
|
|
||
|
|
||
|
<div class="book-header" role="navigation">
|
||
|
|
||
|
|
||
|
<!-- Title -->
|
||
|
<h1>
|
||
|
<i class="fa fa-circle-o-notch fa-spin"></i>
|
||
|
<a href=".." >Policy Gradient原理</a>
|
||
|
</h1>
|
||
|
</div>
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
<div class="page-wrapper" tabindex="-1" role="main">
|
||
|
<div class="page-inner">
|
||
|
|
||
|
<div id="book-search-results">
|
||
|
<div class="search-noresults">
|
||
|
|
||
|
<section class="normal markdown-section">
|
||
|
|
||
|
<h1 id="policy-gradient">Policy Gradient</h1>
|
||
|
<h2 id="policy-gradient的核心思想">Policy Gradient的核心思想</h2>
|
||
|
<p>其实 Policy Gradient 的核心思想非常简单,就是找一个函数<span class="katex"><span class="katex-mathml"><math><semantics><mrow><mi>π</mi></mrow><annotation encoding="application/x-tex">\pi</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="strut" style="height:0.43056em;"></span><span class="strut bottom" style="height:0.43056em;vertical-align:0em;"></span><span class="base textstyle uncramped"><span class="mord mathit" style="margin-right:0.03588em;">π</span></span></span></span>,这个函数<span class="katex"><span class="katex-mathml"><math><semantics><mrow><mi>π</mi></mrow><annotation encoding="application/x-tex">\pi</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="strut" style="height:0.43056em;"></span><span class="strut bottom" style="height:0.43056em;vertical-align:0em;"></span><span class="base textstyle uncramped"><span class="mord mathit" style="margin-right:0.03588em;">π</span></span></span></span>能够根据现在环境的状态(state)来产生接下来要采取的行动或者动作(action)。即<span class="katex"><span class="katex-mathml"><math><semantics><mrow><mi>π</mi><mo>(</mo><mi>s</mi><mi>t</mi><mi>a</mi><mi>t</mi><mi>e</mi><mo>)</mo><mo>→</mo><mi>a</mi><mi>c</mi><mi>t</mi><mi>i</mi><mi>o</mi><mi>n</mi></mrow><annotation encoding="application/x-tex">\pi(state)\rightarrow action</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="strut" style="height:0.75em;"></span><span class="strut bottom" style="height:1em;vertical-align:-0.25em;"></span><span class="base textstyle uncramped"><span class="mord mathit" style="margin-right:0.03588em;">π</span><span class="mopen">(</span><span class="mord mathit">s</span><span class="mord mathit">t</span><span class="mord mathit">a</span><span class="mord mathit">t</span><span class="mord mathit">e</span><span class="mclose">)</span><span class="mrel">→</span><span class="mord mathit">a</span><span class="mord mathit">c</span><span class="mord mathit">t</span><span class="mord mathit">i</span><span class="mord mathit">o</span><span class="mord mathit">n</span></span></span></span>。</p>
|
||
|
<p>函数<span class="katex"><span class="katex-mathml"><math><semantics><mrow><mi>π</mi></mrow><annotation encoding="application/x-tex">\pi</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="strut" style="height:0.43056em;"></span><span class="strut bottom" style="height:0.43056em;vertical-align:0em;"></span><span class="base textstyle uncramped"><span class="mord mathit" style="margin-right:0.03588em;">π</span></span></span></span>其实可以看成是一个模型,那么想在无数次尝试中寻找出能让 Agent 尽量拿高分的模型应该怎样来找呢?我相信您应该猜到了!没错!就是神经网络!</p>
|
||
|
<p>我们可以将游戏画面传给神经网络作为输入,然后神经网络预测一下当前游戏画面下,下一步动作的概率分布。</p>
|
||
|
<p><img src="../img/4.jpg" alt=""></p>
|
||
|
<p>细心的您可能会发现,如果每次取概率最高的动作作为下一步的动作,那不就成分类了么。其实 Policy Gradient 的并不是每次都选取概率最高的动作,而是根据动作的概率分布进行采样。也就是说就算我预测出来的向上挪的概率为 80% ,也不一定会向上挪。</p>
|
||
|
<p>那么为什么采样而不是直接选取概率最大的呢?因为这样很有灵性。可以想象一下,我们和别人下棋的时候,如果一直按照套路来下,那么对手很可能能够猜到我们下一步棋会怎么走,从而占据主动。如果我们时不时地不按套路出牌,但是这种不按套路的动作不会降低太多对于我们能够赢下这一局棋的几率。那么对手很可能会不知所措,主动权就掌握在我们手里。就像《天龙八部》中虚竹大破珍珑棋局时一样,可能有灵性一点,会有意想不到的效果。</p>
|
||
|
<p><img src="../img/5.jpg" alt=""></p>
|
||
|
<h2 id="policy-gradient-的原理">Policy Gradient 的原理</h2>
|
||
|
<p>现在已经知道 Policy Gradient 是通过神经网络来训练模型,该模型需要根据环境状态来预测出下一步动作的概率分布,并根据这个概率分布进行采样,将采样到的动作作为下一步的动作。</p>
|
||
|
<p>那么会有一个灵魂拷问,就是怎样来鉴定我的神经网络是好还是坏呢?很显然,当然是赢的越多越好了!所以我们不妨假设,让计算机玩 10 把乒乓球游戏,那么可能会有这样的一个统计结果。</p>
|
||
|
<p><img src="../img/6.jpg" alt=""></p>
|
||
|
<p>那么怎样评价这 10 把游戏打的好还是不好呢?也很明细,把 10 把游戏的所有反馈全部都加起来就好了。如果把这些反馈的和称为总反馈(总得分),那么就有<strong>总反馈(总得分)=第1把反馈1+第1把反馈2+...+第10把反馈m</strong>。也就是说总反馈越高越好。</p>
|
||
|
<p>说到这,有一个问题需要弄清楚:假设总共玩了 100 把,每 10 把计算一次总反馈,那么这 10 次的总反馈会不会是一模一样的呢?其实仔细想想会发现不会一摸一样,因为:</p>
|
||
|
<ul>
|
||
|
<li>游戏的状态实时在变,所以环境状态不可能一直是一样的。</li>
|
||
|
<li>动作是从一个概率分布中采样出来的。</li>
|
||
|
</ul>
|
||
|
<p>既然总反馈一直会变,那么我们可以尝试换一种思路,即计算总反馈的期望,即总反馈的期望越高越好。那这个期望怎么算呢?</p>
|
||
|
<p>首先我们可以将每一把游戏看成一个游戏序列(<strong>状态1->动作1->反馈1->状态2->动作2->反馈2 ... 状态N->动作N->反馈N</strong>)。那么每一个游戏序列(即每一把游戏)的<strong>反馈=反馈1+反馈2+...+反馈N</strong>。因此,若假设<span class="katex"><span class="katex-mathml"><math><semantics><mrow><mi>R</mi><mo>(</mo><mi>τ</mi><mo>)</mo></mrow><annotation encoding="application/x-tex">R(\tau)</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="strut" style="height:0.75em;"></span><span class="strut bottom" style="height:1em;vertical-align:-0.25em;"></span><span class="base textstyle uncramped"><span class="mord mathit" style="margin-right:0.00773em;">R</span><span class="mopen">(</span><span class="mord mathit" style="margin-right:0.1132em;">τ</span><span class="mclose">)</span></span></span></span>表示游戏序列<span class="katex"><span class="katex-mathml"><math><semantics><mrow><mi>τ</mi></mrow><annotation encoding="application/x-tex">\tau</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="strut" style="height:0.43056em;"></span><span class="strut bottom" style="height:0.43056em;vertical-align:0em;"></span><span class="base textstyle uncramped"><span class="mord mathit" style="margin-right:0.1132em;">τ</span></span></span></span>的反馈,则有:<span class="katex"><span class="katex-mathml"><math><semantics><mrow><mi>R</mi><mo>(</mo><mi>τ</mi><mo>)</mo><mo>=</mo><msubsup><mo>∑</mo><mrow><mi>n</mi><mo>=</mo><mn>1</mn></mrow><mi>N</mi></msubsup><msub><mi>τ</mi><mi>n</mi></msub></mrow><annotation encoding="application/x-tex">R(\tau)=\sum_{n=1}^N\tau_n</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="strut" style="height:0.8423309999999999em;"></span><span class="strut bottom" style="height:1.142341em;vertical-align:-0.30001em;"></span><span class="base textstyle uncramped"><span class="mord mathit" style="margin-right:0.00773em;">R</span><span class="mopen">(</span><span class="mord mathit" style="margin-right:0.1132em;">τ</span><span class="mclose">)</span><span class="mrel">=</span><span class="mop"><span class="mop op-symbol small-op" style="top:-0.0000050000000000050004em;">∑</span><span class="msupsub"><span class="vlist"><span style="top:0.30001em;margin-left:0em;margin-right:0.05em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span><span class="reset-textstyle scriptstyle cramped mtight"><span class="mord scriptstyle cramped mtight"><span class="mord mathit mtight">n</span><span class="mrel mtight">=</span><span class="mord mathrm mtight">1</span></span></span></span><span style="top:-0.364em;margin-right:0.05em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span><span class="reset-textstyle scriptstyle uncramped mtight"><span class="mord mathit mtight" style="margin-right:0.10903em;">N</span></span></span><span class="baseline-fix"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span>​</span></span></span></span><span class="mord"><span class="mord mathit" style="margin-right:0.1132em;">τ</span><span class="msupsub"><span class="vlist"><span style="top:0.15em;margin-right:0.05em;margin-left:-0.1132em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span><span class="reset-textstyle scriptstyle cramped mtight"><span class="mord mathit
|
||
|
<p>如果我们把整个乒乓球游戏所有可能出现的状态,动作,反馈组合起来看成是玩了 N(N很大很大) 把游戏,就会有 N 个游戏序列(<strong>游戏序列1,游戏序列2,游戏序列3, ... , 游戏序列N</strong>)。那么我们在玩游戏时所得到的游戏序列实际上就是从这 N 个游戏序列中采样得到的。</p>
|
||
|
<p>所以我们游戏的总的反馈期望<span class="katex"><span class="katex-mathml"><math><semantics><mrow><mover accent="true"><mrow><msub><mi>R</mi><mi>θ</mi></msub></mrow><mo stretchy="true">‾</mo></mover></mrow><annotation encoding="application/x-tex">\overline{R_\theta}</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="strut" style="height:0.8833300000000001em;"></span><span class="strut bottom" style="height:1.03333em;vertical-align:-0.15em;"></span><span class="base textstyle uncramped"><span class="mord overline"><span class="vlist"><span style="top:0em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:1em;">​</span></span><span class="mord textstyle cramped"><span class="mord"><span class="mord mathit" style="margin-right:0.00773em;">R</span><span class="msupsub"><span class="vlist"><span style="top:0.15em;margin-right:0.05em;margin-left:-0.00773em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span><span class="reset-textstyle scriptstyle cramped mtight"><span class="mord mathit mtight" style="margin-right:0.02778em;">θ</span></span></span><span class="baseline-fix"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span>​</span></span></span></span></span></span><span style="top:-0.80333em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:1em;">​</span></span><span class="reset-textstyle textstyle uncramped overline-line"></span></span><span class="baseline-fix"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:1em;">​</span></span>​</span></span></span></span></span></span>可表示为:<span class="katex"><span class="katex-mathml"><math><semantics><mrow><mover accent="true"><mrow><msub><mi>R</mi><mi>θ</mi></msub></mrow><mo stretchy="true">‾</mo></mover><mo>=</mo><msub><mo>∑</mo><mi>τ</mi></msub><mi>R</mi><mo>(</mo><mi>τ</mi><mo>)</mo><mi>P</mi><mo>(</mo><mi>τ</mi><mi mathvariant="normal">∣</mi><mi>θ</mi><mo>)</mo></mrow><annotation encoding="application/x-tex">\overline{R_\theta}=\sum_\tau R(\tau)P(\tau|\theta)</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="strut" style="height:0.8833300000000001em;"></span><span class="strut bottom" style="height:1.18334em;vertical-align:-0.30001em;"></span><span class="base textstyle uncramped"><span class="mord overline"><span class="vlist"><span style="top:0em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:1em;">​</span></span><span class="mord textstyle cramped"><span class="mord"><span class="mord mathit" style="margin-right:0.00773em;">R</span><span class="msupsub"><span class="vlist"><span style="top:0.15em;margin-right:0.05em;margin-left:-0.00773em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span><span class="reset-textstyle scriptstyle cramped mtight"><span class="mord mathit mtight" style="margin-right:0.02778em;">θ</span></span></span><span class="baseline-fix"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span>​</span></span></span></span></span></span><span style="top:-0.80333em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:1em;">​</span></span><span class="reset-textstyle textstyle uncramped overline-line"></span></span><span class="baseline-fix"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:1em;">​</span></span>​</span></span></span><span class="mrel">=</span><span class="mop"><span class="mop op-symbol small-op" style="top:-0.0000050000000000050004em;">∑</span><span class="msupsub"><span class="vlist"><span style="top:0.30001em;margin-right:0.05em;margin-left:0em;"><span class="fo
|
||
|
<p><img src="../img/7.jpg" alt=""></p>
|
||
|
<p>假设我们玩了 10 把游戏,就相当于得到了 10 个游戏序列[<span class="katex"><span class="katex-mathml"><math><semantics><mrow><msub><mi>τ</mi><mn>1</mn></msub><mo separator="true">,</mo><msub><mi>τ</mi><mn>2</mn></msub><mo separator="true">,</mo><mi mathvariant="normal">.</mi><mi mathvariant="normal">.</mi><mi mathvariant="normal">.</mi><mo separator="true">,</mo><msub><mi>τ</mi><mrow><mn>1</mn><mn>0</mn></mrow></msub></mrow><annotation encoding="application/x-tex">\tau_1, \tau_2, ..., \tau_{10}</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="strut" style="height:0.43056em;"></span><span class="strut bottom" style="height:0.625em;vertical-align:-0.19444em;"></span><span class="base textstyle uncramped"><span class="mord"><span class="mord mathit" style="margin-right:0.1132em;">τ</span><span class="msupsub"><span class="vlist"><span style="top:0.15em;margin-right:0.05em;margin-left:-0.1132em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span><span class="reset-textstyle scriptstyle cramped mtight"><span class="mord mathrm mtight">1</span></span></span><span class="baseline-fix"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span>​</span></span></span></span><span class="mpunct">,</span><span class="mord"><span class="mord mathit" style="margin-right:0.1132em;">τ</span><span class="msupsub"><span class="vlist"><span style="top:0.15em;margin-right:0.05em;margin-left:-0.1132em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span><span class="reset-textstyle scriptstyle cramped mtight"><span class="mord mathrm mtight">2</span></span></span><span class="baseline-fix"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span>​</span></span></span></span><span class="mpunct">,</span><span class="mord mathrm">.</span><span class="mord mathrm">.</span><span class="mord mathrm">.</span><span class="mpunct">,</span><span class="mord"><span class="mord mathit" style="margin-right:0.1132em;">τ</span><span class="msupsub"><span class="vlist"><span style="top:0.15em;margin-right:0.05em;margin-left:-0.1132em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span><span class="reset-textstyle scriptstyle cramped mtight"><span class="mord scriptstyle cramped mtight"><span class="mord mathrm mtight">1</span><span class="mord mathrm mtight">0</span></span></span></span><span class="baseline-fix"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span>​</span></span></span></span></span></span></span>]。这 10 个游戏序列就相当于从 P 中采样了 10 次<span class="katex"><span class="katex-mathml"><math><semantics><mrow><mi>τ</mi></mrow><annotation encoding="application/x-tex">\tau</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="strut" style="height:0.43056em;"></span><span class="strut bottom" style="height:0.43056em;vertical-align:0em;"></span><span class="base textstyle uncramped"><span class="mord mathit" style="margin-right:0.1132em;">τ</span></span></span></span>。所以总反馈期望<span class="katex"><span class="katex-mathml"><math><semantics><mrow><mover accent="true"><mrow><msub><mi>R</mi><mi>θ</mi></msub></mrow><mo stretchy="true">‾</mo></mover></mrow><annotation encoding="application/x-tex">\overline{R_\theta}</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="strut" style="height:0.8833300000000001em;"></span><span class="strut bottom" style="height:1.03333em;vertical-ali
|
||
|
<p><center>
|
||
|
<span class="katex"><span class="katex-mathml"><math><semantics><mrow><mover accent="true"><mrow><msub><mi>R</mi><mi>θ</mi></msub></mrow><mo stretchy="true">‾</mo></mover><mo>≈</mo><mfrac><mrow><mn>1</mn></mrow><mrow><mi>N</mi></mrow></mfrac><msubsup><mo>∑</mo><mrow><mi>n</mi><mo>=</mo><mn>1</mn></mrow><mi>N</mi></msubsup><mi>R</mi><mo>(</mo><msup><mi>τ</mi><mi>n</mi></msup><mo>)</mo></mrow><annotation encoding="application/x-tex">
|
||
|
\overline{R_\theta} \approx \frac{1}{N}\sum_{n=1}^NR(\tau^n)
|
||
|
</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="strut" style="height:0.8833300000000001em;"></span><span class="strut bottom" style="height:1.2283300000000001em;vertical-align:-0.345em;"></span><span class="base textstyle uncramped"><span class="mord overline"><span class="vlist"><span style="top:0em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:1em;">​</span></span><span class="mord textstyle cramped"><span class="mord"><span class="mord mathit" style="margin-right:0.00773em;">R</span><span class="msupsub"><span class="vlist"><span style="top:0.15em;margin-right:0.05em;margin-left:-0.00773em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span><span class="reset-textstyle scriptstyle cramped mtight"><span class="mord mathit mtight" style="margin-right:0.02778em;">θ</span></span></span><span class="baseline-fix"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span>​</span></span></span></span></span></span><span style="top:-0.80333em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:1em;">​</span></span><span class="reset-textstyle textstyle uncramped overline-line"></span></span><span class="baseline-fix"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:1em;">​</span></span>​</span></span></span><span class="mrel">≈</span><span class="mord reset-textstyle textstyle uncramped"><span class="mopen sizing reset-size5 size5 reset-textstyle textstyle uncramped nulldelimiter"></span><span class="mfrac"><span class="vlist"><span style="top:0.345em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span><span class="reset-textstyle scriptstyle cramped mtight"><span class="mord scriptstyle cramped mtight"><span class="mord mathit mtight" style="margin-right:0.10903em;">N</span></span></span></span><span style="top:-0.22999999999999998em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span><span class="reset-textstyle textstyle uncramped frac-line"></span></span><span style="top:-0.394em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span><span class="reset-textstyle scriptstyle uncramped mtight"><span class="mord scriptstyle uncramped mtight"><span class="mord mathrm mtight">1</span></span></span></span><span class="baseline-fix"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span>​</span></span></span><span class="mclose sizing reset-size5 size5 reset-textstyle textstyle uncramped nulldelimiter"></span></span><span class="mop"><span class="mop op-symbol small-op" style="top:-0.0000050000000000050004em;">∑</span><span class="msupsub"><span class="vlist"><span style="top:0.30001em;margin-left:0em;margin-right:0.05em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span><span class="reset-textstyle scriptstyle cramped mtight"><span class="mord scriptstyle cramped mtight"><span class="mord mathit mtight">n</span><span class="mrel mtight">=</span><span class="mord mathrm mtight">1</span></span></span></span><span style="top:-0.364em;margin-right:0.05em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span><span class="reset-textstyle scriptstyle uncramped mtight"><span class="mord mathit mtight" style="margin-right:0.10903em;">N</span></span></span><span class="baseline-fix"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span>​</span></span></span></span><span class="mord mathit" style="margin-right:0.00773em;">R</span><span class="mopen">(</span><span class="mord"><span class="mord mathit" style="margin-right:0.1132em;">τ</span><span class="msupsub"><span class="vlist"><span style="top:-0.363em;margin-right:0.05e
|
||
|
</center>
|
||
|
<br></p>
|
||
|
<p>由于<span class="katex"><span class="katex-mathml"><math><semantics><mrow><mover accent="true"><mrow><msub><mi>R</mi><mi>θ</mi></msub></mrow><mo stretchy="true">‾</mo></mover></mrow><annotation encoding="application/x-tex">\overline{R_\theta}</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="strut" style="height:0.8833300000000001em;"></span><span class="strut bottom" style="height:1.03333em;vertical-align:-0.15em;"></span><span class="base textstyle uncramped"><span class="mord overline"><span class="vlist"><span style="top:0em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:1em;">​</span></span><span class="mord textstyle cramped"><span class="mord"><span class="mord mathit" style="margin-right:0.00773em;">R</span><span class="msupsub"><span class="vlist"><span style="top:0.15em;margin-right:0.05em;margin-left:-0.00773em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span><span class="reset-textstyle scriptstyle cramped mtight"><span class="mord mathit mtight" style="margin-right:0.02778em;">θ</span></span></span><span class="baseline-fix"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span>​</span></span></span></span></span></span><span style="top:-0.80333em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:1em;">​</span></span><span class="reset-textstyle textstyle uncramped overline-line"></span></span><span class="baseline-fix"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:1em;">​</span></span>​</span></span></span></span></span></span>的值越大越好,所以我们可以使用梯度上升的方式来更新<span class="katex"><span class="katex-mathml"><math><semantics><mrow><mi>θ</mi></mrow><annotation encoding="application/x-tex">\theta</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="strut" style="height:0.69444em;"></span><span class="strut bottom" style="height:0.69444em;vertical-align:0em;"></span><span class="base textstyle uncramped"><span class="mord mathit" style="margin-right:0.02778em;">θ</span></span></span></span>。所以就有如下数学推导:</p>
|
||
|
<p><div align="center"><img src="../img/8.jpg" alt=""></div></p>
|
||
|
<p>又由于:</p>
|
||
|
<p><center>
|
||
|
<span class="katex"><span class="katex-mathml"><math><semantics><mrow><mover accent="true"><mrow><msub><mi>R</mi><mi>θ</mi></msub></mrow><mo stretchy="true">‾</mo></mover><mo>=</mo><msub><mo>∑</mo><mi>τ</mi></msub><mi>R</mi><mo>(</mo><mi>τ</mi><mo>)</mo><mi>P</mi><mo>(</mo><mi>τ</mi><mi mathvariant="normal">∣</mi><mi>θ</mi><mo>)</mo><mo>≈</mo><mfrac><mrow><mn>1</mn></mrow><mrow><mi>N</mi></mrow></mfrac><msubsup><mo>∑</mo><mrow><mi>n</mi><mo>=</mo><mn>1</mn></mrow><mi>N</mi></msubsup><mi>R</mi><mo>(</mo><msup><mi>τ</mi><mi>n</mi></msup><mo>)</mo></mrow><annotation encoding="application/x-tex">
|
||
|
\overline{R_\theta} = \sum_\tau R(\tau)P(\tau|\theta) \approx \frac{1}{N}\sum_{n=1}^NR(\tau^n)
|
||
|
</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="strut" style="height:0.8833300000000001em;"></span><span class="strut bottom" style="height:1.2283300000000001em;vertical-align:-0.345em;"></span><span class="base textstyle uncramped"><span class="mord overline"><span class="vlist"><span style="top:0em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:1em;">​</span></span><span class="mord textstyle cramped"><span class="mord"><span class="mord mathit" style="margin-right:0.00773em;">R</span><span class="msupsub"><span class="vlist"><span style="top:0.15em;margin-right:0.05em;margin-left:-0.00773em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span><span class="reset-textstyle scriptstyle cramped mtight"><span class="mord mathit mtight" style="margin-right:0.02778em;">θ</span></span></span><span class="baseline-fix"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span>​</span></span></span></span></span></span><span style="top:-0.80333em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:1em;">​</span></span><span class="reset-textstyle textstyle uncramped overline-line"></span></span><span class="baseline-fix"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:1em;">​</span></span>​</span></span></span><span class="mrel">=</span><span class="mop"><span class="mop op-symbol small-op" style="top:-0.0000050000000000050004em;">∑</span><span class="msupsub"><span class="vlist"><span style="top:0.30001em;margin-right:0.05em;margin-left:0em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span><span class="reset-textstyle scriptstyle cramped mtight"><span class="mord mathit mtight" style="margin-right:0.1132em;">τ</span></span></span><span class="baseline-fix"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span>​</span></span></span></span><span class="mord mathit" style="margin-right:0.00773em;">R</span><span class="mopen">(</span><span class="mord mathit" style="margin-right:0.1132em;">τ</span><span class="mclose">)</span><span class="mord mathit" style="margin-right:0.13889em;">P</span><span class="mopen">(</span><span class="mord mathit" style="margin-right:0.1132em;">τ</span><span class="mord mathrm">∣</span><span class="mord mathit" style="margin-right:0.02778em;">θ</span><span class="mclose">)</span><span class="mrel">≈</span><span class="mord reset-textstyle textstyle uncramped"><span class="mopen sizing reset-size5 size5 reset-textstyle textstyle uncramped nulldelimiter"></span><span class="mfrac"><span class="vlist"><span style="top:0.345em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span><span class="reset-textstyle scriptstyle cramped mtight"><span class="mord scriptstyle cramped mtight"><span class="mord mathit mtight" style="margin-right:0.10903em;">N</span></span></span></span><span style="top:-0.22999999999999998em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span><span class="reset-textstyle textstyle uncramped frac-line"></span></span><span style="top:-0.394em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span><span class="reset-textstyle scriptstyle uncramped mtight"><span class="mord scriptstyle uncramped mtight"><span class="mord mathrm mtight">1</span></span></span></span><span class="baseline-fix"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span>​</span></span></span><span class="mclose sizing reset-size5 size5 reset-textstyle textstyle uncramped nulldelimiter"></span></span><span class="mop"><span class="mop op-symbol small-op" style="top:-0.0000050000000000050004em;">∑</span><span class="msu
|
||
|
</center>
|
||
|
<br></p>
|
||
|
<p>所以就有:</p>
|
||
|
<p><center>
|
||
|
<span class="katex"><span class="katex-mathml"><math><semantics><mrow><mi mathvariant="normal">∇</mi><mover accent="true"><mrow><msub><mi>R</mi><mi>θ</mi></msub></mrow><mo stretchy="true">‾</mo></mover><mo>≈</mo><mfrac><mrow><mn>1</mn></mrow><mrow><mi>N</mi></mrow></mfrac><msubsup><mo>∑</mo><mrow><mi>n</mi><mo>=</mo><mn>1</mn></mrow><mi>N</mi></msubsup><mi>R</mi><mo>(</mo><msup><mi>τ</mi><mi>n</mi></msup><mo>)</mo><mi mathvariant="normal">∇</mi><mi>l</mi><mi>o</mi><mi>g</mi><mi>P</mi><mo>(</mo><msup><mi>τ</mi><mi>n</mi></msup><mi mathvariant="normal">∣</mi><mi>θ</mi><mo>)</mo></mrow><annotation encoding="application/x-tex">
|
||
|
\nabla \overline{R_\theta} \approx \frac{1}{N}\sum_{n=1}^NR(\tau^n) \nabla logP(\tau^n|\theta)
|
||
|
</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="strut" style="height:0.8833300000000001em;"></span><span class="strut bottom" style="height:1.2283300000000001em;vertical-align:-0.345em;"></span><span class="base textstyle uncramped"><span class="mord mathrm">∇</span><span class="mord overline"><span class="vlist"><span style="top:0em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:1em;">​</span></span><span class="mord textstyle cramped"><span class="mord"><span class="mord mathit" style="margin-right:0.00773em;">R</span><span class="msupsub"><span class="vlist"><span style="top:0.15em;margin-right:0.05em;margin-left:-0.00773em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span><span class="reset-textstyle scriptstyle cramped mtight"><span class="mord mathit mtight" style="margin-right:0.02778em;">θ</span></span></span><span class="baseline-fix"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span>​</span></span></span></span></span></span><span style="top:-0.80333em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:1em;">​</span></span><span class="reset-textstyle textstyle uncramped overline-line"></span></span><span class="baseline-fix"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:1em;">​</span></span>​</span></span></span><span class="mrel">≈</span><span class="mord reset-textstyle textstyle uncramped"><span class="mopen sizing reset-size5 size5 reset-textstyle textstyle uncramped nulldelimiter"></span><span class="mfrac"><span class="vlist"><span style="top:0.345em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span><span class="reset-textstyle scriptstyle cramped mtight"><span class="mord scriptstyle cramped mtight"><span class="mord mathit mtight" style="margin-right:0.10903em;">N</span></span></span></span><span style="top:-0.22999999999999998em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span><span class="reset-textstyle textstyle uncramped frac-line"></span></span><span style="top:-0.394em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span><span class="reset-textstyle scriptstyle uncramped mtight"><span class="mord scriptstyle uncramped mtight"><span class="mord mathrm mtight">1</span></span></span></span><span class="baseline-fix"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span>​</span></span></span><span class="mclose sizing reset-size5 size5 reset-textstyle textstyle uncramped nulldelimiter"></span></span><span class="mop"><span class="mop op-symbol small-op" style="top:-0.0000050000000000050004em;">∑</span><span class="msupsub"><span class="vlist"><span style="top:0.30001em;margin-left:0em;margin-right:0.05em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span><span class="reset-textstyle scriptstyle cramped mtight"><span class="mord scriptstyle cramped mtight"><span class="mord mathit mtight">n</span><span class="mrel mtight">=</span><span class="mord mathrm mtight">1</span></span></span></span><span style="top:-0.364em;margin-right:0.05em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span><span class="reset-textstyle scriptstyle uncramped mtight"><span class="mord mathit mtight" style="margin-right:0.10903em;">N</span></span></span><span class="baseline-fix"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span>​</span></span></span></span><span class="mord mathit" style="margin-right:0.00773em;">R</span><span class="mopen">(</span><span class="mord"><span class="mord mathit" style="margin-right:0.1132em;">τ</span><span class="msupsub"><span class="vlist"><sp
|
||
|
</center>
|
||
|
<br></p>
|
||
|
<p>您会发现<span class="katex"><span class="katex-mathml"><math><semantics><mrow><msubsup><mo>∑</mo><mrow><mi>n</mi><mo>=</mo><mn>1</mn></mrow><mi>N</mi></msubsup><mi>R</mi><mo>(</mo><msup><mi>τ</mi><mi>n</mi></msup><mo>)</mo></mrow><annotation encoding="application/x-tex">\sum_{n=1}^NR(\tau^n)</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="strut" style="height:0.8423309999999999em;"></span><span class="strut bottom" style="height:1.142341em;vertical-align:-0.30001em;"></span><span class="base textstyle uncramped"><span class="mop"><span class="mop op-symbol small-op" style="top:-0.0000050000000000050004em;">∑</span><span class="msupsub"><span class="vlist"><span style="top:0.30001em;margin-left:0em;margin-right:0.05em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span><span class="reset-textstyle scriptstyle cramped mtight"><span class="mord scriptstyle cramped mtight"><span class="mord mathit mtight">n</span><span class="mrel mtight">=</span><span class="mord mathrm mtight">1</span></span></span></span><span style="top:-0.364em;margin-right:0.05em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span><span class="reset-textstyle scriptstyle uncramped mtight"><span class="mord mathit mtight" style="margin-right:0.10903em;">N</span></span></span><span class="baseline-fix"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span>​</span></span></span></span><span class="mord mathit" style="margin-right:0.00773em;">R</span><span class="mopen">(</span><span class="mord"><span class="mord mathit" style="margin-right:0.1132em;">τ</span><span class="msupsub"><span class="vlist"><span style="top:-0.363em;margin-right:0.05em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span><span class="reset-textstyle scriptstyle uncramped mtight"><span class="mord mathit mtight">n</span></span></span><span class="baseline-fix"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span>​</span></span></span></span><span class="mclose">)</span></span></span></span>很好算,只要把反馈全部加起来就完事了,难算的是<span class="katex"><span class="katex-mathml"><math><semantics><mrow><mi mathvariant="normal">∇</mi><mi>l</mi><mi>o</mi><mi>g</mi><mi>P</mi><mo>(</mo><msup><mi>τ</mi><mi>n</mi></msup><mi mathvariant="normal">∣</mi><mi>θ</mi><mo>)</mo></mrow><annotation encoding="application/x-tex">\nabla logP(\tau^n|\theta)</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="strut" style="height:0.75em;"></span><span class="strut bottom" style="height:1em;vertical-align:-0.25em;"></span><span class="base textstyle uncramped"><span class="mord mathrm">∇</span><span class="mord mathit" style="margin-right:0.01968em;">l</span><span class="mord mathit">o</span><span class="mord mathit" style="margin-right:0.03588em;">g</span><span class="mord mathit" style="margin-right:0.13889em;">P</span><span class="mopen">(</span><span class="mord"><span class="mord mathit" style="margin-right:0.1132em;">τ</span><span class="msupsub"><span class="vlist"><span style="top:-0.363em;margin-right:0.05em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span><span class="reset-textstyle scriptstyle uncramped mtight"><span class="mord mathit mtight">n</span></span></span><span class="baseline-fix"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span>​</span></span></span></span><span class="mord mathrm">∣</span><span class="mord mathit" style="margin-right:0.02778em;">θ</span><span class="mclose">)</span></span
|
||
|
<p>由于一个游戏序列<span class="katex"><span class="katex-mathml"><math><semantics><mrow><mi>τ</mi></mrow><annotation encoding="application/x-tex">\tau</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="strut" style="height:0.43056em;"></span><span class="strut bottom" style="height:0.43056em;vertical-align:0em;"></span><span class="base textstyle uncramped"><span class="mord mathit" style="margin-right:0.1132em;">τ</span></span></span></span>是由多个状态,动作,反馈构成的,即:</p>
|
||
|
<p><center>
|
||
|
<span class="katex"><span class="katex-mathml"><math><semantics><mrow><mi>τ</mi><mo>=</mo><mo>{</mo><msub><mi>s</mi><mn>1</mn></msub><mo separator="true">,</mo><msub><mi>a</mi><mn>1</mn></msub><mo separator="true">,</mo><msub><mi>r</mi><mn>1</mn></msub><mo separator="true">,</mo><msub><mi>s</mi><mn>2</mn></msub><mo separator="true">,</mo><msub><mi>a</mi><mn>2</mn></msub><mo separator="true">,</mo><msub><mi>r</mi><mn>2</mn></msub><mo separator="true">,</mo><mi mathvariant="normal">.</mi><mi mathvariant="normal">.</mi><mi mathvariant="normal">.</mi><mo separator="true">,</mo><msub><mi>s</mi><mi>T</mi></msub><mo separator="true">,</mo><msub><mi>a</mi><mi>T</mi></msub><mo separator="true">,</mo><msub><mi>r</mi><mi>T</mi></msub><mo>}</mo></mrow><annotation encoding="application/x-tex">
|
||
|
\tau=\{s_1, a_1, r_1, s_2, a_2, r_2, ..., s_T, a_T, r_T\}
|
||
|
</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="strut" style="height:0.75em;"></span><span class="strut bottom" style="height:1em;vertical-align:-0.25em;"></span><span class="base textstyle uncramped"><span class="mord mathit" style="margin-right:0.1132em;">τ</span><span class="mrel">=</span><span class="mopen">{</span><span class="mord"><span class="mord mathit">s</span><span class="msupsub"><span class="vlist"><span style="top:0.15em;margin-right:0.05em;margin-left:0em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span><span class="reset-textstyle scriptstyle cramped mtight"><span class="mord mathrm mtight">1</span></span></span><span class="baseline-fix"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span>​</span></span></span></span><span class="mpunct">,</span><span class="mord"><span class="mord mathit">a</span><span class="msupsub"><span class="vlist"><span style="top:0.15em;margin-right:0.05em;margin-left:0em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span><span class="reset-textstyle scriptstyle cramped mtight"><span class="mord mathrm mtight">1</span></span></span><span class="baseline-fix"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span>​</span></span></span></span><span class="mpunct">,</span><span class="mord"><span class="mord mathit" style="margin-right:0.02778em;">r</span><span class="msupsub"><span class="vlist"><span style="top:0.15em;margin-right:0.05em;margin-left:-0.02778em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span><span class="reset-textstyle scriptstyle cramped mtight"><span class="mord mathrm mtight">1</span></span></span><span class="baseline-fix"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span>​</span></span></span></span><span class="mpunct">,</span><span class="mord"><span class="mord mathit">s</span><span class="msupsub"><span class="vlist"><span style="top:0.15em;margin-right:0.05em;margin-left:0em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span><span class="reset-textstyle scriptstyle cramped mtight"><span class="mord mathrm mtight">2</span></span></span><span class="baseline-fix"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span>​</span></span></span></span><span class="mpunct">,</span><span class="mord"><span class="mord mathit">a</span><span class="msupsub"><span class="vlist"><span style="top:0.15em;margin-right:0.05em;margin-left:0em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span><span class="reset-textstyle scriptstyle cramped mtight"><span class="mord mathrm mtight">2</span></span></span><span class="baseline-fix"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span>​</span></span></span></span><span class="mpunct">,</span><span class="mord"><span class="mord mathit" style="margin-right:0.02778em;">r</span><span class="msupsub"><span class="vlist"><span style="top:0.15em;margin-right:0.05em;margin-left:-0.02778em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span><span class="reset-textstyle scriptstyle cramped mtight"><span class="mord mathrm mtight">2</span></span></span><span class="baseline-fix"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span>​</span></span></span></span><span class="mpunct">,</span><span class="mord mathrm">.</span><span class="mord mathrm">.</span><span class="mord mathrm">.</span><span class="mpunct">,</span><span class="mord"><span class="mord mathit">s</span><span class="msupsub"><span class="vlist"><span style="top:0.15em;margin-right:0.05em;margin
|
||
|
</center>
|
||
|
<br></p>
|
||
|
<p>所以:</p>
|
||
|
<p><center>
|
||
|
<span class="katex"><span class="katex-mathml"><math><semantics><mrow><mi>P</mi><mo>(</mo><mi>τ</mi><mi mathvariant="normal">∣</mi><mi>θ</mi><mo>)</mo><mo>=</mo><mi>P</mi><mo>(</mo><msub><mi>s</mi><mn>1</mn></msub><mo>)</mo><mi>P</mi><mo>(</mo><msub><mi>a</mi><mn>1</mn></msub><mi mathvariant="normal">∣</mi><msub><mi>s</mi><mn>1</mn></msub><mo separator="true">,</mo><mi>θ</mi><mo>)</mo><mi>P</mi><mo>(</mo><msub><mi>r</mi><mn>1</mn></msub><mo separator="true">,</mo><msub><mi>s</mi><mn>2</mn></msub><mi mathvariant="normal">∣</mi><msub><mi>s</mi><mn>1</mn></msub><mo separator="true">,</mo><msub><mi>a</mi><mn>1</mn></msub><mo>)</mo><mi>P</mi><mo>(</mo><msub><mi>a</mi><mn>2</mn></msub><mi mathvariant="normal">∣</mi><msub><mi>s</mi><mn>2</mn></msub><mo separator="true">,</mo><mi>θ</mi><mo>)</mo><mi>P</mi><mo>(</mo><msub><mi>r</mi><mn>2</mn></msub><mo separator="true">,</mo><msub><mi>s</mi><mn>3</mn></msub><mi mathvariant="normal">∣</mi><msub><mi>s</mi><mn>2</mn></msub><mo separator="true">,</mo><msub><mi>a</mi><mn>2</mn></msub><mo>)</mo><mi mathvariant="normal">.</mi><mi mathvariant="normal">.</mi><mi mathvariant="normal">.</mi></mrow><annotation encoding="application/x-tex">
|
||
|
P(\tau|\theta)=P(s_1)P(a_1|s_1,\theta)P(r_1,s_2|s_1,a_1)P(a_2|s_2,\theta)P(r_2,s_3|s_2,a_2)...
|
||
|
</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="strut" style="height:0.75em;"></span><span class="strut bottom" style="height:1em;vertical-align:-0.25em;"></span><span class="base textstyle uncramped"><span class="mord mathit" style="margin-right:0.13889em;">P</span><span class="mopen">(</span><span class="mord mathit" style="margin-right:0.1132em;">τ</span><span class="mord mathrm">∣</span><span class="mord mathit" style="margin-right:0.02778em;">θ</span><span class="mclose">)</span><span class="mrel">=</span><span class="mord mathit" style="margin-right:0.13889em;">P</span><span class="mopen">(</span><span class="mord"><span class="mord mathit">s</span><span class="msupsub"><span class="vlist"><span style="top:0.15em;margin-right:0.05em;margin-left:0em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span><span class="reset-textstyle scriptstyle cramped mtight"><span class="mord mathrm mtight">1</span></span></span><span class="baseline-fix"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span>​</span></span></span></span><span class="mclose">)</span><span class="mord mathit" style="margin-right:0.13889em;">P</span><span class="mopen">(</span><span class="mord"><span class="mord mathit">a</span><span class="msupsub"><span class="vlist"><span style="top:0.15em;margin-right:0.05em;margin-left:0em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span><span class="reset-textstyle scriptstyle cramped mtight"><span class="mord mathrm mtight">1</span></span></span><span class="baseline-fix"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span>​</span></span></span></span><span class="mord mathrm">∣</span><span class="mord"><span class="mord mathit">s</span><span class="msupsub"><span class="vlist"><span style="top:0.15em;margin-right:0.05em;margin-left:0em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span><span class="reset-textstyle scriptstyle cramped mtight"><span class="mord mathrm mtight">1</span></span></span><span class="baseline-fix"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span>​</span></span></span></span><span class="mpunct">,</span><span class="mord mathit" style="margin-right:0.02778em;">θ</span><span class="mclose">)</span><span class="mord mathit" style="margin-right:0.13889em;">P</span><span class="mopen">(</span><span class="mord"><span class="mord mathit" style="margin-right:0.02778em;">r</span><span class="msupsub"><span class="vlist"><span style="top:0.15em;margin-right:0.05em;margin-left:-0.02778em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span><span class="reset-textstyle scriptstyle cramped mtight"><span class="mord mathrm mtight">1</span></span></span><span class="baseline-fix"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span>​</span></span></span></span><span class="mpunct">,</span><span class="mord"><span class="mord mathit">s</span><span class="msupsub"><span class="vlist"><span style="top:0.15em;margin-right:0.05em;margin-left:0em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span><span class="reset-textstyle scriptstyle cramped mtight"><span class="mord mathrm mtight">2</span></span></span><span class="baseline-fix"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span>​</span></span></span></span><span class="mord mathrm">∣</span><span class="mord"><span class="mord mathit">s</span><span class="msupsub"><span class="vlist"><span style="top:0.15em;margin-right:0.05em;margin-left:0em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span><spa
|
||
|
</center>
|
||
|
<br></p>
|
||
|
<p>稍微整理一下可知:</p>
|
||
|
<p><center>
|
||
|
<span class="katex"><span class="katex-mathml"><math><semantics><mrow><mi>P</mi><mo>(</mo><mi>τ</mi><mi mathvariant="normal">∣</mi><mi>θ</mi><mo>)</mo><mo>=</mo><mi>P</mi><mo>(</mo><msub><mi>s</mi><mn>1</mn></msub><mo>)</mo><msubsup><mo>∏</mo><mrow><mi>t</mi><mo>=</mo><mn>1</mn></mrow><mi>T</mi></msubsup><mi>P</mi><mo>(</mo><msub><mi>a</mi><mi>t</mi></msub><mi mathvariant="normal">∣</mi><msub><mi>s</mi><mi>t</mi></msub><mo separator="true">,</mo><mi>θ</mi><mo>)</mo><mi>P</mi><mo>(</mo><msub><mi>τ</mi><mi>t</mi></msub><mo separator="true">,</mo><msub><mi>s</mi><mrow><mi>t</mi><mo>+</mo><mn>1</mn></mrow></msub><mi mathvariant="normal">∣</mi><msub><mi>s</mi><mi>t</mi></msub><mo separator="true">,</mo><msub><mi>a</mi><mi>t</mi></msub><mo>)</mo></mrow><annotation encoding="application/x-tex">
|
||
|
P(\tau|\theta)=P(s_1)\prod_{t=1}^TP(a_t|s_t,\theta)P(\tau_t,s_{t+1}|s_t,a_t)
|
||
|
</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="strut" style="height:0.8423309999999999em;"></span><span class="strut bottom" style="height:1.142341em;vertical-align:-0.30001em;"></span><span class="base textstyle uncramped"><span class="mord mathit" style="margin-right:0.13889em;">P</span><span class="mopen">(</span><span class="mord mathit" style="margin-right:0.1132em;">τ</span><span class="mord mathrm">∣</span><span class="mord mathit" style="margin-right:0.02778em;">θ</span><span class="mclose">)</span><span class="mrel">=</span><span class="mord mathit" style="margin-right:0.13889em;">P</span><span class="mopen">(</span><span class="mord"><span class="mord mathit">s</span><span class="msupsub"><span class="vlist"><span style="top:0.15em;margin-right:0.05em;margin-left:0em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span><span class="reset-textstyle scriptstyle cramped mtight"><span class="mord mathrm mtight">1</span></span></span><span class="baseline-fix"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span>​</span></span></span></span><span class="mclose">)</span><span class="mop"><span class="mop op-symbol small-op" style="top:-0.0000050000000000050004em;">∏</span><span class="msupsub"><span class="vlist"><span style="top:0.30001em;margin-left:0em;margin-right:0.05em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span><span class="reset-textstyle scriptstyle cramped mtight"><span class="mord scriptstyle cramped mtight"><span class="mord mathit mtight">t</span><span class="mrel mtight">=</span><span class="mord mathrm mtight">1</span></span></span></span><span style="top:-0.364em;margin-right:0.05em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span><span class="reset-textstyle scriptstyle uncramped mtight"><span class="mord mathit mtight" style="margin-right:0.13889em;">T</span></span></span><span class="baseline-fix"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span>​</span></span></span></span><span class="mord mathit" style="margin-right:0.13889em;">P</span><span class="mopen">(</span><span class="mord"><span class="mord mathit">a</span><span class="msupsub"><span class="vlist"><span style="top:0.15em;margin-right:0.05em;margin-left:0em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span><span class="reset-textstyle scriptstyle cramped mtight"><span class="mord mathit mtight">t</span></span></span><span class="baseline-fix"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span>​</span></span></span></span><span class="mord mathrm">∣</span><span class="mord"><span class="mord mathit">s</span><span class="msupsub"><span class="vlist"><span style="top:0.15em;margin-right:0.05em;margin-left:0em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span><span class="reset-textstyle scriptstyle cramped mtight"><span class="mord mathit mtight">t</span></span></span><span class="baseline-fix"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span>​</span></span></span></span><span class="mpunct">,</span><span class="mord mathit" style="margin-right:0.02778em;">θ</span><span class="mclose">)</span><span class="mord mathit" style="margin-right:0.13889em;">P</span><span class="mopen">(</span><span class="mord"><span class="mord mathit" style="margin-right:0.1132em;">τ</span><span class="msupsub"><span class="vlist"><span style="top:0.15em;margin-right:0.05em;margin-left:-0.1132em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span><span class="reset-textstyle scriptstyle cramped mtight"><span class="mord mathit mtight">t</sp
|
||
|
</center>
|
||
|
<br></p>
|
||
|
<p>然后两边取<span class="katex"><span class="katex-mathml"><math><semantics><mrow><mi>l</mi><mi>o</mi><mi>g</mi></mrow><annotation encoding="application/x-tex">log</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="strut" style="height:0.69444em;"></span><span class="strut bottom" style="height:0.8888799999999999em;vertical-align:-0.19444em;"></span><span class="base textstyle uncramped"><span class="mord mathit" style="margin-right:0.01968em;">l</span><span class="mord mathit">o</span><span class="mord mathit" style="margin-right:0.03588em;">g</span></span></span></span>会得到:</p>
|
||
|
<p><center>
|
||
|
<span class="katex"><span class="katex-mathml"><math><semantics><mrow><mi>l</mi><mi>o</mi><mi>g</mi><mi>P</mi><mo>(</mo><mi>τ</mi><mi mathvariant="normal">∣</mi><mi>θ</mi><mo>)</mo><mo>=</mo><msubsup><mo>∑</mo><mrow><mi>t</mi><mo>=</mo><mn>1</mn></mrow><mi>T</mi></msubsup><mi mathvariant="normal">∇</mi><mi>l</mi><mi>o</mi><mi>g</mi><mi>P</mi><mo>(</mo><msub><mi>a</mi><mi>t</mi></msub><mi mathvariant="normal">∣</mi><msub><mi>s</mi><mi>t</mi></msub><mo separator="true">,</mo><mi>θ</mi><mo>)</mo></mrow><annotation encoding="application/x-tex">
|
||
|
logP(\tau|\theta)=\sum_{t=1}^T\nabla logP(a_t|s_t,\theta)
|
||
|
</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="strut" style="height:0.8423309999999999em;"></span><span class="strut bottom" style="height:1.142341em;vertical-align:-0.30001em;"></span><span class="base textstyle uncramped"><span class="mord mathit" style="margin-right:0.01968em;">l</span><span class="mord mathit">o</span><span class="mord mathit" style="margin-right:0.03588em;">g</span><span class="mord mathit" style="margin-right:0.13889em;">P</span><span class="mopen">(</span><span class="mord mathit" style="margin-right:0.1132em;">τ</span><span class="mord mathrm">∣</span><span class="mord mathit" style="margin-right:0.02778em;">θ</span><span class="mclose">)</span><span class="mrel">=</span><span class="mop"><span class="mop op-symbol small-op" style="top:-0.0000050000000000050004em;">∑</span><span class="msupsub"><span class="vlist"><span style="top:0.30001em;margin-left:0em;margin-right:0.05em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span><span class="reset-textstyle scriptstyle cramped mtight"><span class="mord scriptstyle cramped mtight"><span class="mord mathit mtight">t</span><span class="mrel mtight">=</span><span class="mord mathrm mtight">1</span></span></span></span><span style="top:-0.364em;margin-right:0.05em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span><span class="reset-textstyle scriptstyle uncramped mtight"><span class="mord mathit mtight" style="margin-right:0.13889em;">T</span></span></span><span class="baseline-fix"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span>​</span></span></span></span><span class="mord mathrm">∇</span><span class="mord mathit" style="margin-right:0.01968em;">l</span><span class="mord mathit">o</span><span class="mord mathit" style="margin-right:0.03588em;">g</span><span class="mord mathit" style="margin-right:0.13889em;">P</span><span class="mopen">(</span><span class="mord"><span class="mord mathit">a</span><span class="msupsub"><span class="vlist"><span style="top:0.15em;margin-right:0.05em;margin-left:0em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span><span class="reset-textstyle scriptstyle cramped mtight"><span class="mord mathit mtight">t</span></span></span><span class="baseline-fix"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span>​</span></span></span></span><span class="mord mathrm">∣</span><span class="mord"><span class="mord mathit">s</span><span class="msupsub"><span class="vlist"><span style="top:0.15em;margin-right:0.05em;margin-left:0em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span><span class="reset-textstyle scriptstyle cramped mtight"><span class="mord mathit mtight">t</span></span></span><span class="baseline-fix"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span>​</span></span></span></span><span class="mpunct">,</span><span class="mord mathit" style="margin-right:0.02778em;">θ</span><span class="mclose">)</span></span></span></span>
|
||
|
</center>
|
||
|
<br></p>
|
||
|
<p><span class="katex"><span class="katex-mathml"><math><semantics><mrow><mi>P</mi><mo>(</mo><msub><mi>a</mi><mi>t</mi></msub><mi mathvariant="normal">∣</mi><msub><mi>s</mi><mi>t</mi></msub><mo separator="true">,</mo><mi>θ</mi><mo>)</mo></mrow><annotation encoding="application/x-tex">P(a_t|s_t,\theta)</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="strut" style="height:0.75em;"></span><span class="strut bottom" style="height:1em;vertical-align:-0.25em;"></span><span class="base textstyle uncramped"><span class="mord mathit" style="margin-right:0.13889em;">P</span><span class="mopen">(</span><span class="mord"><span class="mord mathit">a</span><span class="msupsub"><span class="vlist"><span style="top:0.15em;margin-right:0.05em;margin-left:0em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span><span class="reset-textstyle scriptstyle cramped mtight"><span class="mord mathit mtight">t</span></span></span><span class="baseline-fix"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span>​</span></span></span></span><span class="mord mathrm">∣</span><span class="mord"><span class="mord mathit">s</span><span class="msupsub"><span class="vlist"><span style="top:0.15em;margin-right:0.05em;margin-left:0em;"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span><span class="reset-textstyle scriptstyle cramped mtight"><span class="mord mathit mtight">t</span></span></span><span class="baseline-fix"><span class="fontsize-ensurer reset-size5 size5"><span style="font-size:0em;">​</span></span>​</span></span></span></span><span class="mpunct">,</span><span class="mord mathit" style="margin-right:0.02778em;">θ</span><span class="mclose">)</span></span></span></span>其实就是我们神经网络根据环境状态预测出来的下一步的动作概率分布。</p>
|
||
|
<p><img src="../img/9.jpg" alt=""></p>
|
||
|
<p>OK,到这里,Policy Gradient 的数学推导全部推导完毕了。我们不妨用一张图来总结一下 Policy Gradient 的算法流程。流程如下:</p>
|
||
|
<p><img src="../img/10.jpg" alt=""></p>
|
||
|
|
||
|
|
||
|
</section>
|
||
|
|
||
|
</div>
|
||
|
<div class="search-results">
|
||
|
<div class="has-results">
|
||
|
|
||
|
<h1 class="search-results-title"><span class='search-results-count'></span> results matching "<span class='search-query'></span>"</h1>
|
||
|
<ul class="search-results-list"></ul>
|
||
|
|
||
|
</div>
|
||
|
<div class="no-results">
|
||
|
|
||
|
<h1 class="search-results-title">No results matching "<span class='search-query'></span>"</h1>
|
||
|
|
||
|
</div>
|
||
|
</div>
|
||
|
</div>
|
||
|
|
||
|
</div>
|
||
|
</div>
|
||
|
|
||
|
</div>
|
||
|
|
||
|
|
||
|
|
||
|
<a href="what is reinforce learning.html" class="navigation navigation-prev " aria-label="Previous page: 什么是强化学习">
|
||
|
<i class="fa fa-angle-left"></i>
|
||
|
</a>
|
||
|
|
||
|
|
||
|
<a href="coding.html" class="navigation navigation-next " aria-label="Next page: 使用Policy Gradient玩乒乓球游戏">
|
||
|
<i class="fa fa-angle-right"></i>
|
||
|
</a>
|
||
|
|
||
|
|
||
|
|
||
|
</div>
|
||
|
|
||
|
<script>
|
||
|
var gitbook = gitbook || [];
|
||
|
gitbook.push(function() {
|
||
5 years ago
|
gitbook.page.hasChanged({"page":{"title":"Policy Gradient原理","level":"1.6.2.2","depth":3,"next":{"title":"使用Policy Gradient玩乒乓球游戏","level":"1.6.2.3","depth":3,"path":"pingpong/coding.md","ref":"./pingpong/coding.md","articles":[]},"previous":{"title":"什么是强化学习","level":"1.6.2.1","depth":3,"path":"pingpong/what is reinforce learning.md","ref":"./pingpong/what is reinforce learning.md","articles":[]},"dir":"ltr"},"config":{"gitbook":"*","theme":"default","variables":{},"plugins":["katex"],"pluginsConfig":{"katex":{},"highlight":{},"search":{},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"fontsettings":{"theme":"white","family":"sans","size":2},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"showLevel":false}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"}},"file":{"path":"pingpong/Policy Gradient.md","mtime":"2019-07-05T02:04:09.124Z","type":"markdown"},"gitbook":{"version":"3.2.3","time":"2019-07-06T07:31:21.537Z"},"basePath":"..","book":{"language":""}});
|
||
5 years ago
|
});
|
||
|
</script>
|
||
|
</div>
|
||
|
|
||
|
|
||
|
<script src="../gitbook/gitbook.js"></script>
|
||
|
<script src="../gitbook/theme.js"></script>
|
||
|
|
||
|
|
||
|
<script src="../gitbook/gitbook-plugin-search/search-engine.js"></script>
|
||
|
|
||
|
|
||
|
|
||
|
<script src="../gitbook/gitbook-plugin-search/search.js"></script>
|
||
|
|
||
|
|
||
|
|
||
|
<script src="../gitbook/gitbook-plugin-lunr/lunr.min.js"></script>
|
||
|
|
||
|
|
||
|
|
||
|
<script src="../gitbook/gitbook-plugin-lunr/search-lunr.js"></script>
|
||
|
|
||
|
|
||
|
|
||
|
<script src="../gitbook/gitbook-plugin-sharing/buttons.js"></script>
|
||
|
|
||
|
|
||
|
|
||
|
<script src="../gitbook/gitbook-plugin-fontsettings/fontsettings.js"></script>
|
||
|
|
||
|
|
||
|
|
||
|
</body>
|
||
|
</html>
|
||
|
|