机器学习 August 20, 2018

4-2 梯度下降法的总结以及相关讨论

Word count: 3.2k · Reading time: 3 mins · Read count: 0

如何调试梯度

import numpy as np
import matplotlib.pyplot as plt
# Build a reproducible synthetic linear-regression data set.
# Fix the seed so the random draws below are the same on every run.
np.random.seed(666)
# Feature matrix: 1000 samples with 10 features each, drawn by np.random.random.
X = np.random.random(size=(1000,10))
# Ground-truth parameters 1.0 .. 11.0: one per column of X_b (11 in total).
true_theta = np.arange(1,12,dtype=float)
# Design matrix: a column of ones stacked in front of X, so the first
# parameter acts as the intercept.
X_b = np.hstack([np.ones((len(X),1)),X])
# Targets: the exact linear response plus np.random.normal noise per sample.
y = X_b.dot(true_theta) + np.random.normal(size=1000)
X.shape
(1000, 10)
y.shape
(1000,)
true_theta
array([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11.])
def J(theta,X_b,y):
    """Return the mean squared error of the linear model against ``y``.

    Computes ``sum((y - X_b.dot(theta)) ** 2) / len(X_b)``.

    Args:
        theta: Parameter vector, one entry per column of ``X_b``.
        X_b: Design matrix (samples in rows); must provide ``dot`` and ``len``.
        y: Target values, one per row of ``X_b``.

    Returns:
        The mean squared error, or ``float('inf')`` if evaluating it raises
        an ordinary exception, so a failed evaluation reads as an infinitely
        bad cost instead of aborting the caller.
    """
    try:
        return np.sum((y - X_b.dot(theta))**2) / len(X_b)
    # Catch Exception rather than using a bare ``except``: KeyboardInterrupt
    # and SystemExit must still propagate so a long run can be stopped.
    except Exception:
        return float('inf')
def dJ_math(theta, X_b, y):
    """Return the closed-form gradient of the mean squared error.

    Evaluates ``X_b.T.dot(X_b.dot(theta) - y) * 2 / len(y)``.

    Args:
        theta: Parameter vector at which the gradient is evaluated.
        X_b: Design matrix (samples in rows).
        y: Target values, one per row of ``X_b``.

    Returns:
        The gradient, one component per entry of ``theta``.
    """
    # Prediction error of the current parameters on every sample.
    residual = X_b.dot(theta) - y
    return X_b.T.dot(residual) * 2. / len(y)
def dJ_debug(theta,X_b,y,epsilon=0.01):
    """Approximate the gradient of ``J`` numerically by central differences.

    Each component ``i`` is estimated as
    ``(J(theta + epsilon * e_i) - J(theta - epsilon * e_i)) / (2 * epsilon)``.
    This is slow (two evaluations of ``J`` per parameter) and is meant for
    checking an analytic gradient, not for production use.

    Args:
        theta: Parameter vector at which the gradient is estimated.
        X_b: Design matrix passed through to ``J``.
        y: Target values passed through to ``J``.
        epsilon: Half-width of the finite-difference step.

    Returns:
        A float array with one gradient estimate per entry of ``theta``.
    """
    # Work on a float copy: with an integer-dtype theta the in-place
    # ``+= epsilon`` would be truncated back to the integer, making both
    # perturbed points equal and the estimated gradient all zeros.
    base = np.array(theta, dtype=float)
    res = np.empty(len(base))
    for i in range(len(base)):
        theta_plus = base.copy()
        theta_plus[i] += epsilon
        theta_minus = base.copy()
        theta_minus[i] -= epsilon
        res[i] = (J(theta_plus, X_b, y) - J(theta_minus, X_b, y)) / (2 * epsilon)
    return res
        
def gradient_descent(dJ,X_b,y,initial_theta,eta,n_iters = 1e4,epsilon=1e-8):
    """Minimise the module-level cost ``J`` by batch gradient descent.

    Args:
        dJ: Gradient function called as ``dJ(theta, X_b, y)``.
        X_b: Design matrix passed through to ``dJ`` and ``J``.
        y: Target values passed through to ``dJ`` and ``J``.
        initial_theta: Starting parameter vector; it is not modified.
        eta: Learning rate (step size multiplier on the gradient).
        n_iters: Upper bound on the number of update steps.
        epsilon: Stop as soon as one step changes the cost by less than this.

    Returns:
        The parameter vector after the last update step.
    """
    theta = initial_theta
    # Keep the cost of the current theta so that each iteration evaluates J
    # once instead of recomputing the previous cost.
    cost = J(theta, X_b, y)
    i_iter = 0
    while i_iter < n_iters:
        gradient = dJ(theta, X_b, y)
        theta = theta - eta * gradient
        last_cost, cost = cost, J(theta, X_b, y)
        if abs(cost - last_cost) < epsilon:
            break
        i_iter += 1
    return theta
# Design matrix: a column of ones stacked in front of the features X.
X_b = np.hstack([np.ones((len(X), 1)), X])
# Start from the all-zero parameter vector, one entry per column of X_b.
initial_theta = np.zeros(X_b.shape[1])
# Learning rate passed to gradient_descent.
eta = 0.01

# Time gradient descent driven by the finite-difference gradient dJ_debug,
# then display the fitted parameters.
%time theta = gradient_descent(dJ_debug,X_b,y,initial_theta,eta)
theta
CPU times: user 7.72 s, sys: 55 ms, total: 7.77 s
Wall time: 3.98 s





array([ 1.1251597 ,  2.05312521,  2.91522497,  4.11895968,  5.05002117,
        5.90494046,  6.97383745,  8.00088367,  8.86213468,  9.98608331,
       10.90529198])
%time theta2 = gradient_descent(dJ_math,X_b,y,initial_theta,eta)
theta
CPU times: user 1.15 s, sys: 13.3 ms, total: 1.17 s
Wall time: 611 ms





array([ 1.1251597 ,  2.05312521,  2.91522497,  4.11895968,  5.05002117,
        5.90494046,  6.97383745,  8.00088367,  8.86213468,  9.98608331,
       10.90529198])

梯度下降法总结

0%