Visualizing the BGD Loss
BGD stands for Batch Gradient Descent. Using a regression line as the example, the code below (Regression.py) generates the regression data.
import numpy as np

np.random.seed(1)

def getData(n):
    # Noisy points scattered around the line y = 3x + 2
    x = np.arange(-5, 5.1, 10 / n)
    y = 3 * x + 2 + (np.random.rand(len(x)) - 0.5) * 20
    return x, y

def getLoss(xs, ys):
    # Mean squared error evaluated over a grid of (a, b) values,
    # used to draw the loss contours
    a = np.arange(-10, 16, 1)
    b = np.arange(-10, 16, 1)
    mesh = np.meshgrid(a, b)
    loss = 0
    for x, y in zip(xs, ys):
        loss += ((mesh[0] * x + mesh[1]) - y) ** 2
    loss /= len(xs)
    return mesh, loss
Taking the gradient of the BGD loss function gives:
$(\frac{\partial Loss(a,b)}{\partial a}=\frac{2}{n}\sum_{i=1}^{n}(ax_{i}+b-y_{i})\,x_{i})$
$(\frac{\partial Loss(a,b)}{\partial b}=\frac{2}{n}\sum_{i=1}^{n}(ax_{i}+b-y_{i}))$
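Both gradients follow from differentiating the mean-squared-error loss that getLoss evaluates over the grid:
$(Loss(a,b)=\frac{1}{n}\sum_{i=1}^{n}(ax_{i}+b-y_{i})^{2})$
By the chain rule, differentiating the squared residual with respect to $(a)$ pulls out the inner factor $(x_{i})$, while differentiating with respect to $(b)$ pulls out a factor of 1, which gives the two formulas above.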
Below is the code for the BGD class (BGD.py):
import numpy as np

class BGD():
    def __init__(self, a, b, x, y, lr):
        self.a = a
        self.b = b
        self.x = x
        self.y = y
        self.lr = lr
        self.a_old = a
        self.b_old = b
        self.loss = None

    # Loss function (mean squared error)
    def mse(self):
        loss = ((self.a * self.x + self.b) - self.y) ** 2
        return np.mean(loss)

    def gradient(self):
        grad_a = 2 * np.mean((self.a * self.x + self.b - self.y) * self.x)
        grad_b = 2 * np.mean(self.a * self.x + self.b - self.y)
        return grad_a, grad_b

    def update(self):
        # Compute the gradient
        grad_a, grad_b = self.gradient()
        # Gradient update
        self.a_old = self.a
        self.b_old = self.b
        self.a = self.a - self.lr * grad_a
        self.b = self.b - self.lr * grad_b
        self.loss = self.mse()
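Before wiring up the animated demo, the class can be sanity-checked on its own. The following is a minimal sketch, assuming Regression.py and BGD.py sit in the same directory, using the same initial values and learning rate as the main program below:

from Regression import getData
from BGD import BGD

x, y = getData(100)
gd = BGD(a=-9, b=-9, x=x, y=y, lr=0.1)
for i in range(50):
    gd.update()   # one full-batch gradient step
    print(f'iter {i + 1:03d}: a={gd.a:.4f}, b={gd.b:.4f}, loss={gd.loss:.4f}')

The printed loss should drop steadily toward the noise floor as (a, b) approaches (3, 2).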
Below is the code for the main program:
# BGD: Batch Gradient Descent
import threading
import time
import pylab as plt
from BGD import BGD
from Regression import *

def runnable():
    for i in range(epoch):
        gd.update()
        a = gd.a
        b = gd.b
        loss = gd.loss
        ax[0].clear()
        ax[0].set_xlim(-5, 5)
        ax[0].set_ylim(-30, 30)
        ax[0].scatter(x, y)
        ax[0].plot([x[0], x[-1]], [a * x[0] + b, a * x[-1] + b], c="orange")
        ax[0].set_title(f'{a:.6f}x+{b:.6f}')
        ax[1].set_xlim(-10, 15)
        ax[1].set_ylim(-10, 15)
        ax[1].set_title(f'iter:{i+1:03d}, loss={loss:.6f}')
        ax[1].plot([gd.a_old, a], [gd.b_old, b], c='r')
        ax[1].scatter(a, b, c='g')
        plt.draw()
        time.sleep(0.5)

epoch = 50
x, y = getData(100)
mesh, loss = getLoss(x, y)
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))
a2 = ax[1].contourf(mesh[0], mesh[1], loss, 15, cmap=plt.cm.Purples)
ax[1].set_xlabel("a")
ax[1].set_ylabel("b")
plt.colorbar(a2, ax=ax[1])
#lr = 0.058
lr = 0.1
a = -9  # initial values
b = -9
ax[1].scatter(a, b, c='g')
gd = BGD(a, b, x, y, lr)
t = threading.Thread(target=runnable)
t.start()
plt.show()
SGD: Stochastic Gradient Descent
SGD stands for Stochastic [stəˈkæstɪk] Gradient Descent.
After taking the gradient of the BGD loss function, the formula $(\frac{\partial Loss(a,b)}{\partial a}=\frac{2}{n}\sum_{i=1}^{n}(ax_{i}+b-y_{i})\,x_{i})$ requires running through every $(x_{i})$ and $(y_{i})$. With only a few hundred or a few thousand points the computer can cope, but with millions or hundreds of millions of points the cost becomes prohibitive.
SGD simply drops the $(\sum)$ and instead randomly picks a single x and y for the computation, so what used to be a sum over hundreds of millions of values becomes a single evaluation, which is much faster. Of course, there is no free lunch: as shown in the figure below, the descent path becomes very unstable.
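Concretely, for one randomly drawn index $(i)$, the full-batch gradients are replaced by single-sample estimates
$(\frac{\partial Loss}{\partial a}\approx 2(ax_{i}+b-y_{i})\,x_{i})$ and $(\frac{\partial Loss}{\partial b}\approx 2(ax_{i}+b-y_{i}))$,
and the parameters are then updated as before with $(a\leftarrow a-\eta\,grad_{a})$, $(b\leftarrow b-\eta\,grad_{b})$, where $(\eta)$ is the learning rate lr. This is exactly what the gradient() override in the SGD class below computes; the update itself is inherited from BGD.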
The SGD class is as follows:
from BGD import BGD
import numpy as np

class SGD(BGD):
    def __init__(self, a, b, x, y, lr):
        super().__init__(a, b, x, y, lr)

    def gradient(self):
        # Randomly pick a single sample for the gradient computation
        idx = np.random.randint(len(self.x))
        grad_a = 2 * (self.a * self.x[idx] + self.b - self.y[idx]) * self.x[idx]
        grad_b = 2 * (self.a * self.x[idx] + self.b - self.y[idx])
        return grad_a, grad_b
The main program is as follows:
import threading
import time
from Regression import *
import pylab as plt
from SGD import SGD
def runnable():
    for i in range(epoch):
        gd.update()
        a = gd.a
        b = gd.b
        loss = gd.loss
        ax[0].clear()
        ax[0].set_xlim(-5, 5)
        ax[0].set_ylim(-30, 30)
        ax[0].scatter(x, y)
        ax[0].plot([x[0], x[-1]], [a * x[0] + b, a * x[-1] + b], c="orange")
        ax[0].set_title(f'{a:.6f}x+{b:.6f}')
        print('iter=' + str(i) + ', loss=' + '{:.2f}'.format(gd.loss))
        ax[1].set_xlim(-10, 15)
        ax[1].set_ylim(-10, 15)
        ax[1].set_title(f'iter:{i:03d} Loss: {loss:.6f}')
        ax[1].plot([gd.a_old, a], [gd.b_old, b], c='r')
        ax[1].scatter(a, b, c='g')
        ax[1].set_xlabel("a")
        ax[1].set_ylabel("b")
        plt.draw()
        time.sleep(0.5)
epoch=40
x,y=getData(100)
mesh, loss=getLoss(x,y)
fig, ax=plt.subplots(nrows=1, ncols=2, figsize=(12,4))
a2=ax[1].contourf(mesh[0], mesh[1], loss,15, cmap=plt.cm.Purples)
plt.colorbar(a2,ax=ax[1])
lr = 0.01
a = -9; b = -9
ax[1].scatter(a, b, c='g')
gd = SGD(a, b, x, y, lr)
t=threading.Thread(target=runnable)
t.start()
plt.show()
MBGD: Mini-Batch Gradient Descent
MBGD stands for Mini-Batch Gradient Descent.
Because SGD's single randomly chosen point is simply too inaccurate, we instead randomly pick, say, 100 or 1000 points per step. As the figures below show, the path is indeed much smoother, and it is still much faster than full-batch BGD. When MBGD computes the loss it can produce np.nan (once the batch index runs past the end of the data), so that case has to be handled when it occurs.
Plain SGD is rarely usable in practice, so when people say SGD they usually actually mean MBGD.
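For a randomly drawn batch $(B)$ of size $(m)$ (the batch_size parameter), the gradient estimate is simply the average over that batch:
$(\frac{\partial Loss}{\partial a}\approx\frac{2}{m}\sum_{i\in B}(ax_{i}+b-y_{i})\,x_{i})$, $(\frac{\partial Loss}{\partial b}\approx\frac{2}{m}\sum_{i\in B}(ax_{i}+b-y_{i}))$
which sits between full-batch BGD ($(m=n)$) and single-sample SGD ($(m=1)$).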
The MBGD class is as follows:
from BGD import BGD
import numpy as np

class MBGD(BGD):
    def __init__(self, a, b, x, y, lr, batch_size):
        super().__init__(a, b, x, y, lr)
        self.batch_size = batch_size
        self.shuffle = np.random.permutation(len(x))
        self.refresh()
        self.update_batch()

    def refresh(self):
        # Reshuffle the data and start extracting batches from the beginning
        self.shuffle = np.random.permutation(len(self.x))
        self.idx = 0

    def update_batch(self):
        # Take the next batch_size indices from the shuffled order
        idx = self.shuffle[self.idx:self.idx + self.batch_size]
        self.idx += self.batch_size
        self.x_batch = self.x[idx]
        self.y_batch = self.y[idx]

    def gradient(self):
        grad_a = 2 * np.mean((self.a * self.x_batch + self.b - self.y_batch) * self.x_batch)
        grad_b = 2 * np.mean(self.a * self.x_batch + self.b - self.y_batch)
        # Once the batch extraction reaches the end of the data, reshuffle and
        # start over from the beginning, otherwise the loss becomes nan
        if self.idx > len(self.x):
            self.refresh()
        self.update_batch()
        return grad_a, grad_b
The main program is as follows:
import threading
import time
from MBGD import MBGD
from Regression import *
import pylab as plt
def runnable():
    for i in range(epoch):
        gd.update()
        a = gd.a
        b = gd.b
        loss = gd.loss
        ax[0].clear()
        ax[0].set_xlim(-5, 5)
        ax[0].set_ylim(-30, 30)
        ax[0].scatter(x, y)
        ax[0].plot([x[0], x[-1]], [a * x[0] + b, a * x[-1] + b], c="orange")
        ax[0].set_title(f'{a:.6f}x+{b:.6f}')
        print('iter=' + str(i) + ', loss=' + '{:.2f}'.format(gd.loss))
        ax[1].set_xlim(-10, 15)
        ax[1].set_ylim(-10, 15)
        ax[1].set_title(f'iter:{i+1:03d} Loss: {loss:.6f}')
        ax[1].plot([gd.a_old, a], [gd.b_old, b], c='r')
        ax[1].scatter(a, b, c='g')
        ax[1].set_xlabel("a")
        ax[1].set_ylabel("b")
        plt.draw()
        time.sleep(0.5)
epoch=40
x,y=getData(100)
mesh, loss=getLoss(x,y)
fig, ax=plt.subplots(nrows=1, ncols=2, figsize=(12,4))
a2=ax[1].contourf(mesh[0], mesh[1], loss,15, cmap=plt.cm.Purples)
plt.colorbar(a2,ax=ax[1])
lr = 0.05  # assumed value; the learning rate must be > 0 for the parameters to move
a = -9; b = -9
ax[1].scatter(a, b, c='g')
batch_size=25
gd = MBGD(a, b, x, y, lr, batch_size)
t=threading.Thread(target=runnable)
t.start()
plt.show()
SGD + Momentum
The SGD (MBGD) above can easily get stuck at a saddle point, so momentum can be added.
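With momentum coefficient $(\gamma)$ (gamma) and learning rate $(\eta)$, each parameter keeps a running velocity that is updated before the parameter itself:
$(m_{a}\leftarrow\gamma m_{a}+\eta\frac{\partial Loss}{\partial a},\quad a\leftarrow a-m_{a})$
$(m_{b}\leftarrow\gamma m_{b}+\eta\frac{\partial Loss}{\partial b},\quad b\leftarrow b-m_{b})$
Past gradients keep pushing the parameters in a consistent direction even when a single mini-batch gradient is small or noisy. This is what the update() override in the SGDM class below implements.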
The SGDM class is as follows:
from MBGD import MBGD

class SGDM(MBGD):
    def __init__(self, a, b, x, y, lr, batch_size, gamma):
        super().__init__(a, b, x, y, lr, batch_size)
        self.gamma = gamma
        self.ma = 0
        self.mb = 0

    def update(self):
        grad_a, grad_b = self.gradient()
        self.a_old = self.a
        self.b_old = self.b
        # Momentum: accumulate a velocity term for each parameter
        self.ma = self.gamma * self.ma + self.lr * grad_a
        self.mb = self.gamma * self.mb + self.lr * grad_b
        self.a -= self.ma
        self.b -= self.mb
        self.loss = self.mse()
The main program is as follows:
import threading
import time
from SGDM import SGDM
from Regression import *
import pylab as plt

def runnable():
    for i in range(epoch):
        gd.update()
        a = gd.a
        b = gd.b
        loss = gd.loss
        ax[0].clear()
        ax[0].set_xlim(-5, 5)
        ax[0].set_ylim(-30, 30)
        ax[0].scatter(x, y)
        ax[0].plot([x[0], x[-1]], [a * x[0] + b, a * x[-1] + b], c="orange")
        ax[0].set_title(f'{a:.6f}x+{b:.6f}')
        print('iter=' + str(i) + ', loss=' + '{:.2f}'.format(gd.loss))
        ax[1].set_xlim(-10, 15)
        ax[1].set_ylim(-10, 15)
        ax[1].set_title(f'iter:{i+1:03d} Loss: {loss:.6f}')
        ax[1].plot([gd.a_old, a], [gd.b_old, b], c='r')
        ax[1].scatter(a, b, c='g')
        ax[1].set_xlabel("a")
        ax[1].set_ylabel("b")
        plt.draw()
        time.sleep(0.5)

epoch = 40
x, y = getData(100)
mesh, loss = getLoss(x, y)
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))
a2 = ax[1].contourf(mesh[0], mesh[1], loss, 15, cmap=plt.cm.Purples)
plt.colorbar(a2, ax=ax[1])
lr = 0.008
a = -9; b = -9
ax[1].scatter(a, b, c='g')
batch_size = 5
gamma = 0.8
gd = SGDM(a, b, x, y, lr, batch_size, gamma)
t = threading.Thread(target=runnable)
t.start()
plt.show()