Normal Loss Rate


BGD Loss Rate Illustrated

BGD stands for Batch Gradient Descent. Taking a regression line as the example, below is the code for Regression.py, which generates the regression data:

import numpy as np

np.random.seed(1)

def getData(n):
    # about n+1 evenly spaced points on y = 3x + 2, plus uniform noise in [-10, 10)
    x = np.arange(-5, 5.1, 10 / n)
    y = 3 * x + 2 + (np.random.rand(len(x)) - 0.5) * 20
    return x, y

def getLoss(xs, ys):
    # evaluate the MSE loss on a grid of (a, b) values for the contour plot
    a = np.arange(-10, 16, 1)
    b = np.arange(-10, 16, 1)
    mesh = np.meshgrid(a, b)
    loss = 0
    for x, y in zip(xs, ys):
        loss += ((mesh[0] * x + mesh[1]) - y) ** 2
    loss /= len(xs)
    return mesh, loss

Taking the partial derivatives of the BGD loss function gives:

$\frac{\partial Loss(a,b)}{\partial a}=\frac{2}{n}\sum_{i=1}^{n}(ax_{i}+b-y_{i})\,x_{i}$
$\frac{\partial Loss(a,b)}{\partial b}=\frac{2}{n}\sum_{i=1}^{n}(ax_{i}+b-y_{i})$
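As a minimal sanity check (not part of the original files), the two analytic gradients above can be compared against central finite differences at the starting point (a, b) = (-9, -9); the two pairs should match closely:

import numpy as np
from Regression import getData

def loss(a, b, x, y):
    return np.mean((a * x + b - y) ** 2)

x, y = getData(100)
a, b, eps = -9.0, -9.0, 1e-6
grad_a = 2 * np.mean((a * x + b - y) * x)   # analytic, from the formula above
grad_b = 2 * np.mean(a * x + b - y)
num_a = (loss(a + eps, b, x, y) - loss(a - eps, b, x, y)) / (2 * eps)
num_b = (loss(a, b + eps, x, y) - loss(a, b - eps, x, y)) / (2 * eps)
print(f'analytic : {grad_a:.4f}, {grad_b:.4f}')
print(f'numerical: {num_a:.4f}, {num_b:.4f}')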

Below is the code for the BGD class, BGD.py:

import numpy as np
class BGD():
    def __init__(self, a, b, x, y, lr):
        self.a = a
        self.b = b
        self.x = x
        self.y = y
        self.lr = lr
        self.a_old = a
        self.b_old = b
        self.loss = None

    # Loss function
    def mse(self):
        loss = ((self.a * self.x + self.b) - self.y) ** 2
        return np.mean(loss)

    def gradient(self):
        grad_a = 2 * np.mean((self.a * self.x + self.b - self.y) * (self.x))
        grad_b = 2 * np.mean((self.a * self.x + self.b - self.y))
        return grad_a, grad_b

    def update(self):
        # compute the gradient
        grad_a, grad_b = self.gradient()

        # gradient update: step against the gradient, scaled by lr
        self.a_old = self.a
        self.b_old = self.b
        self.a = self.a - self.lr * grad_a
        self.b = self.b - self.lr * grad_b
        self.loss = self.mse()

Below is the code for the main program:

# BGD: Batch Gradient Descent
import threading
import time
import pylab as plt
from BGD import BGD
from Regression import *
def runnable():
    for i in range(epoch):
        gd.update()
        a=gd.a
        b=gd.b
        loss=gd.loss
        ax[0].clear()
        ax[0].set_xlim(-5, 5)
        ax[0].set_ylim(-30,30)
        ax[0].scatter(x, y)
        ax[0].plot([x[0], x[-1]], [a*x[0]+b, a*x[-1]+b], c="orange")
        ax[0].set_title(f'{a:.6f}x+{b:.6f}')

        ax[1].set_xlim(-10, 15)
        ax[1].set_ylim(-10, 15)
        ax[1].set_title(f'iter:{i+1:03d}, loss={loss:.6f}')
        ax[1].plot([gd.a_old, a],[gd.b_old,b],c='r')

        ax[1].scatter(a, b, c='g')

        plt.draw()
        time.sleep(0.5)
epoch=50
x,y = getData(100)
mesh, loss=getLoss(x,y)
fig,ax=plt.subplots(nrows=1, ncols=2, figsize=(12,4))
a2=ax[1].contourf(mesh[0], mesh[1], loss, 15, cmap=plt.cm.Purples)
ax[1].set_xlabel("a")
ax[1].set_ylabel("b")

plt.colorbar(a2, ax=ax[1])
# lr = 0.058
lr = 0.1
a = -9  # initial values
b = -9
ax[1].scatter(a, b, c='g')
gd=BGD(a, b, x, y, lr)
t=threading.Thread(target=runnable)
t.start()
plt.show()
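As a cross-check (not part of the original program), the line that BGD converges to can be compared against the closed-form least-squares fit from np.polyfit; the two should agree closely:

import numpy as np
from BGD import BGD
from Regression import getData

x, y = getData(100)
gd = BGD(-9, -9, x, y, lr=0.1)
for _ in range(50):
    gd.update()
coef = np.polyfit(x, y, 1)   # closed-form least squares: [slope, intercept]
print(f'BGD    : a={gd.a:.4f}, b={gd.b:.4f}')
print(f'polyfit: a={coef[0]:.4f}, b={coef[1]:.4f}')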

SGD: Stochastic Gradient Descent

SGD stands for Stochastic [stəˈkæstɪk] Gradient Descent.

Taking the gradient of the BGD loss function gives $\frac{\partial Loss(a,b)}{\partial a}=\frac{2}{n}\sum_{i=1}^{n}(ax_{i}+b-y_{i})\,x_{i}$, which must be computed over every $x_{i}$ and $y_{i}$. With only a few hundred or a few thousand points a computer copes easily, but with millions or hundreds of millions of points the cost becomes prohibitive.

SGD simply drops the $\sum$ and instead draws one random $x$ and $y$ per step, so a sum over hundreds of millions of values becomes a single computation, which is far faster. Of course, there is no free lunch: as the figure below shows, the descent path becomes extremely unstable.
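Dropping the sum still works on average: when the index $i$ is drawn uniformly at random, the expected value of the single-sample gradient equals the full BGD gradient,

$E_{i}\left[2(ax_{i}+b-y_{i})x_{i}\right]=\frac{1}{n}\sum_{i=1}^{n}2(ax_{i}+b-y_{i})x_{i}=\frac{\partial Loss(a,b)}{\partial a}$

so each noisy step points in the right direction in expectation, just with high variance.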

The SGD class is as follows:

from BGD import BGD
import numpy as np
class SGD(BGD):
    def __init__(self, a, b, x, y, lr):
        super().__init__(a, b, x, y, lr)
    def gradient(self):
        # randomly pick one sample and compute the gradient from it
        idx = np.random.randint(len(self.x))
        grad_a = 2 * (self.a * self.x[idx] + self.b - self.y[idx]) * (self.x[idx])
        grad_b = 2 * (self.a * self.x[idx] + self.b - self.y[idx])
        return grad_a, grad_b
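To see how noisy those single-sample estimates are, here is a small sketch (assumed, not part of the original files) that compares the per-sample values of grad_a with the full-batch gradient at the starting point:

import numpy as np
from Regression import getData

x, y = getData(100)
a, b = -9, -9
per_sample = 2 * (a * x + b - y) * x   # one grad_a estimate per data point
print(f'full-batch grad_a : {per_sample.mean():.2f}')
print(f'per-sample std dev: {per_sample.std():.2f}')

The standard deviation is of the same order as the gradient itself, which is exactly the zig-zagging visible in the contour plot.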

The main program is as follows:

import threading
import time
from Regression import *
import pylab as plt
from SGD import SGD
def runnable():
    for i in range(epoch):
        gd.update()
        a=gd.a
        b=gd.b
        loss=gd.loss
        ax[0].clear()
        ax[0].set_xlim(-5,5)
        ax[0].set_ylim(-30, 30)
        ax[0].scatter(x, y)
        ax[0].plot([x[0], x[-1]],[a*x[0]+b, a*x[-1]+b], c="orange")
        ax[0].set_title(f'{a:.6f}x+{b:.6f}')

        print(f'iter={i}, loss={gd.loss:.2f}')
        ax[1].set_xlim(-10,15)
        ax[1].set_ylim(-10, 15)
        ax[1].set_title(f'iter:{i+1:03d}, loss={loss:.6f}')
        ax[1].plot([gd.a_old, a], [gd.b_old, b], c='r')
        ax[1].scatter(a, b, c='g')
        ax[1].set_xlabel("a")
        ax[1].set_ylabel("b")
        plt.draw()
        time.sleep(0.5)
epoch=40
x,y=getData(100)
mesh, loss=getLoss(x,y)
fig, ax=plt.subplots(nrows=1, ncols=2, figsize=(12,4))

a2=ax[1].contourf(mesh[0], mesh[1], loss,15, cmap=plt.cm.Purples)
plt.colorbar(a2,ax=ax[1])

lr = 0.01
a = -9; b = -9
ax[1].scatter(a, b, c='g')
gd = SGD(a, b, x, y, lr)
t=threading.Thread(target=runnable)
t.start()
plt.show()

MBGD: Mini-Batch Gradient Descent

MBGD stands for Mini-Batch Gradient Descent.

Because SGD estimates the gradient from a single randomly chosen point, it is quite inaccurate, so instead we randomly pick, say, 100 or 1000 points per step. The figure below shows that this is indeed much smoother, and it is also much faster. Computing the loss in MBGD can produce np.nan once the batch index runs past the end of the data (the mini-batch becomes empty), so when that happens the data are reshuffled and extraction restarts from the beginning.

Plain SGD is really not fit for serious use, so when people talk about SGD they usually actually mean MBGD.
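Concretely, MBGD averages the gradient over a random mini-batch $B$ instead of over the whole data set; this is exactly what the np.mean calls in the class below compute:

$\frac{\partial Loss(a,b)}{\partial a}\approx\frac{2}{|B|}\sum_{i\in B}(ax_{i}+b-y_{i})\,x_{i},\qquad\frac{\partial Loss(a,b)}{\partial b}\approx\frac{2}{|B|}\sum_{i\in B}(ax_{i}+b-y_{i})$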

The MBGD class is as follows:

from BGD import BGD
import numpy as np

class MBGD(BGD):
    def __init__(self, a, b, x, y, lr, batch_size):
        super().__init__(a, b, x, y, lr)
        self.batch_size = batch_size
        self.refresh()
        self.update_batch()

    def refresh(self):
        # reshuffle the data and restart extraction from the beginning
        self.shuffle = np.random.permutation(len(self.x))
        self.idx = 0

    def update_batch(self):
        # take the next batch_size indices in the shuffled order
        idx = self.shuffle[self.idx:self.idx + self.batch_size]
        self.idx += self.batch_size
        self.x_batch = self.x[idx]
        self.y_batch = self.y[idx]

    def gradient(self):
        grad_a = 2 * np.mean((self.a * self.x_batch + self.b - self.y_batch) * self.x_batch)
        grad_b = 2 * np.mean((self.a * self.x_batch + self.b - self.y_batch))

        # once extraction has reached the end of the data, reshuffle and
        # restart from the beginning, so the next batch is never empty
        # (an empty batch would make the loss np.nan)
        if self.idx >= len(self.x):
            self.refresh()
        self.update_batch()
        return grad_a, grad_b
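A quick headless check (assumed, not part of the original files) that the reshuffling keeps every mini-batch non-empty, so the loss never becomes np.nan even across several full passes over the data:

import numpy as np
from MBGD import MBGD
from Regression import getData

x, y = getData(100)
gd = MBGD(-9, -9, x, y, lr=0.01, batch_size=25)
for i in range(200):   # several full passes over the data
    gd.update()
    assert not np.isnan(gd.loss), f'nan at iteration {i}'
print(f'final loss after 200 updates: {gd.loss:.4f}')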

The main program is as follows:

import threading
import time
from MBGD import MBGD
from Regression import *
import pylab as plt
def runnable():
    for i in range(epoch):
        gd.update()
        a=gd.a
        b=gd.b
        loss=gd.loss
        ax[0].clear()
        ax[0].set_xlim(-5,5)
        ax[0].set_ylim(-30, 30)
        ax[0].scatter(x, y)
        ax[0].plot([x[0], x[-1]],[a*x[0]+b, a*x[-1]+b], c="orange")
        ax[0].set_title(f'{a:.6f}x+{b:.6f}')

        print(f'iter={i}, loss={gd.loss:.2f}')
        ax[1].set_xlim(-10,15)
        ax[1].set_ylim(-10, 15)
        ax[1].set_title(f'iter:{i+1:03d}, loss={loss:.6f}')
        ax[1].plot([gd.a_old, a], [gd.b_old, b], c='r')
        ax[1].scatter(a, b, c='g')
        ax[1].set_xlabel("a")
        ax[1].set_ylabel("b")
        plt.draw()
        time.sleep(0.5)
epoch=40
x,y=getData(100)
mesh, loss=getLoss(x,y)
fig, ax=plt.subplots(nrows=1, ncols=2, figsize=(12,4))

a2=ax[1].contourf(mesh[0], mesh[1], loss,15, cmap=plt.cm.Purples)
plt.colorbar(a2,ax=ax[1])

lr = 0.01
a = -9; b = -9
ax[1].scatter(a, b, c='g')
batch_size=25
gd = MBGD(a, b, x, y, lr, batch_size)
t=threading.Thread(target=runnable)
t.start()
plt.show()

SGD + Momentum

The SGD (MBGD) above easily gets stuck at saddle points, so momentum can be added.
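In symbols, with momentum coefficient $\gamma$ and learning rate $lr$, each update keeps a running velocity and steps by it instead of by the raw gradient:

$m_{a}\leftarrow\gamma\,m_{a}+lr\cdot\frac{\partial Loss}{\partial a},\qquad a\leftarrow a-m_{a}$

and likewise for $b$. With $\gamma=0$ this reduces to plain MBGD; otherwise past gradients keep contributing, which carries the parameters through flat regions and saddle points.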

The SGDM class is as follows:

from MBGD import MBGD
class SGDM(MBGD):
    def __init__(self, a, b, x, y, lr, batch_size, gamma):
        super().__init__(a, b, x, y, lr, batch_size)
        self.gamma=gamma
        self.ma=0
        self.mb=0
    def update(self):
        grad_a, grad_b = self.gradient()
        self.a_old = self.a
        self.b_old = self.b
        # accumulate the velocity, then step by it instead of the raw gradient
        self.ma = self.gamma * self.ma + self.lr * grad_a
        self.mb = self.gamma * self.mb + self.lr * grad_b
        self.a -= self.ma
        self.b -= self.mb
        self.loss = self.mse()

The main program is as follows:

import threading
import time
from SGDM import SGDM
from Regression import *
import pylab as plt
def runnable():
    for i in range(epoch):
        gd.update()
        a=gd.a
        b=gd.b
        loss=gd.loss
        ax[0].clear()
        ax[0].set_xlim(-5,5)
        ax[0].set_ylim(-30, 30)
        ax[0].scatter(x, y)
        ax[0].plot([x[0], x[-1]],[a*x[0]+b, a*x[-1]+b], c="orange")
        ax[0].set_title(f'{a:.6f}x+{b:.6f}')

        print(f'iter={i}, loss={gd.loss:.2f}')
        ax[1].set_xlim(-10,15)
        ax[1].set_ylim(-10, 15)
        ax[1].set_title(f'iter:{i+1:03d}, loss={loss:.6f}')
        ax[1].plot([gd.a_old, a], [gd.b_old, b], c='r')
        ax[1].scatter(a, b, c='g')
        ax[1].set_xlabel("a")
        ax[1].set_ylabel("b")
        plt.draw()
        time.sleep(0.5)
epoch=40
x,y=getData(100)
mesh, loss=getLoss(x,y)
fig, ax=plt.subplots(nrows=1, ncols=2, figsize=(12,4))

a2=ax[1].contourf(mesh[0], mesh[1], loss,15, cmap=plt.cm.Purples)
plt.colorbar(a2,ax=ax[1])

lr = 0.008
a = -9; b = -9
ax[1].scatter(a, b, c='g')
batch_size=5
gamma=0.8
gd = SGDM(a, b, x, y, lr, batch_size, gamma)
t=threading.Thread(target=runnable)
t.start()
plt.show()
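Finally, a minimal headless sketch (not part of the original programs) that runs all four optimizers on the same data for 50 updates each and prints the fitted line and final loss; the hyperparameters follow the main programs above:

from Regression import getData
from BGD import BGD
from SGD import SGD
from MBGD import MBGD
from SGDM import SGDM

x, y = getData(100)
runs = [
    ('BGD',  BGD(-9, -9, x, y, 0.1)),
    ('SGD',  SGD(-9, -9, x, y, 0.01)),
    ('MBGD', MBGD(-9, -9, x, y, 0.01, 25)),
    ('SGDM', SGDM(-9, -9, x, y, 0.008, 5, 0.8)),
]
for name, gd in runs:
    for _ in range(50):
        gd.update()
    print(f'{name:5s}: a={gd.a:.4f}, b={gd.b:.4f}, loss={gd.loss:.4f}')

With the same budget of 50 updates, BGD gives the smoothest trajectory, while the stochastic variants trade per-step accuracy for per-step cost.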