Machine Learning August 20, 2018

5-1 Multivariate Linear Regression




Implementation in code
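The fit_normal method below solves for the parameters in closed form with the normal equation. Writing X_b for the training matrix with a column of ones prepended (so the intercept is absorbed into the parameter vector), the standard least-squares solution is

$$\hat{\theta} = (X_b^{\mathsf{T}} X_b)^{-1} X_b^{\mathsf{T}} y$$

which is exactly what the np.linalg.inv line in fit_normal computes.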

import numpy as np
from sklearn.metrics import r2_score


class LinearRegression:
    def __init__(self):
        """Initialize the Linear Regression model"""
        self.coef_ = None  # coefficients
        self.intercept_ = None  # intercept
        self._theta = None

    def fit_normal(self, X_train, y_train):
        """Train the Linear Regression model on the training set X_train, y_train"""
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must be equal to the size of y_train"
        # X_b is X_train with an extra column prepended whose entries are all 1:
        # np.ones((len(X_train), 1)) builds a column vector of ones
        # (len(X_train) rows, 1 column), which np.hstack glues onto X_train.
        X_b = np.hstack([np.ones((len(X_train), 1)), X_train])

        # Solve for _theta with the normal equation;
        # np.linalg.inv() computes the inverse of a matrix.
        self._theta = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y_train)

        # The intercept is the first element of _theta
        self.intercept_ = self._theta[0]
        # The coefficients are the remaining elements, from index 1 to the end
        self.coef_ = self._theta[1:]

        return self

    def predict(self, X_predict):
        """Given a dataset X_predict, return the vector of predictions for it"""
        assert self.intercept_ is not None and self.coef_ is not None, \
            "must fit before predict"
        assert X_predict.shape[1] == len(self.coef_), \
            "the feature number of X_predict must be equal to X_train"
        X_b = np.hstack([np.ones((len(X_predict), 1)), X_predict])
        return X_b.dot(self._theta)

    def score(self, X_test, y_test):
        """Return the R^2 score of the model on the test set X_test, y_test"""
        y_predict = self.predict(X_test)
        return r2_score(y_test, y_predict)

    def __repr__(self):
        return "LinearRegression()"

Using the multivariate linear regression model

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
# Load the Boston housing dataset
boston = datasets.load_boston()

X = boston.data
y = boston.target

# The target is capped at 50.0, so those censored points are dropped
X = X[y < 50.0]
y = y[y < 50.0]
X.shape
(490, 13)
y.shape
(490,)
# Split the data into training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)
# Use our own LinearRegression
from script.LinearRegression import LinearRegression
reg = LinearRegression()
reg.fit_normal(X_train, y_train)
LinearRegression()
reg.coef_
array([-1.15467813e-01,  3.92880958e-02, -3.41748487e-02,  7.47995516e-01,
       -1.10513264e+01,  3.32719731e+00, -1.84801883e-02, -1.15846165e+00,
        2.23328413e-01, -1.19884387e-02, -7.98506487e-01,  8.97887470e-03,
       -3.90390966e-01])
reg.intercept_
32.40591102447736
reg.score(X_test, y_test)
0.7727301788951539
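Because train_test_split shuffles at random, the coefficients and the R² score above will vary from run to run. To make the numbers reproducible, pass a fixed seed; the value 666 here is arbitrary:

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)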

Regression in scikit-learn

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
lin_reg.coef_
array([ -0.10279992,   0.03681912,  -0.04638299,   0.5490078 ,
       -10.33884868,   3.72440973,  -0.02280334,  -1.13176363,
         0.30092941,  -0.01554197,  -0.7849784 ,   0.01055632,
        -0.37071212])
lin_reg.intercept_
29.400432014797325
lin_reg.score(X_test, y_test)
0.7716499667746084
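The coefficients and score differ slightly from our fit_normal run above only because train_test_split was re-run and produced a different random split; both models solve the same least-squares problem. A quick sanity check on one shared split (this comparison is my addition; scikit-learn's class is aliased to dodge the name clash, and the seed is arbitrary):

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression as SKLinearRegression
from script.LinearRegression import LinearRegression  # our implementation

X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=42)
ours = LinearRegression().fit_normal(X_tr, y_tr)
theirs = SKLinearRegression().fit(X_tr, y_tr)

# Fitted parameters should agree to within floating-point tolerance
print(np.allclose(ours.coef_, theirs.coef_, rtol=1e-4))
print(np.allclose(ours.intercept_, theirs.intercept_, rtol=1e-4))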

kNN Regressor

from sklearn.neighbors import KNeighborsRegressor
knn_reg = KNeighborsRegressor()
knn_reg.fit(X_train, y_train)
KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform')
knn_reg.score(X_test, y_test)
0.5960534303527143
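kNN clearly trails linear regression here. One likely reason, not addressed in the run above, is that the Boston features live on very different scales, so large-valued features such as TAX dominate the Euclidean distance. A minimal sketch of the standard remedy, standardizing the features inside a Pipeline (the variable names are mine):

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor

# Scale each feature to zero mean and unit variance before computing distances
knn_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("knn", KNeighborsRegressor()),
])
knn_pipe.fit(X_train, y_train)
knn_pipe.score(X_test, y_test)  # typically improves noticeably over the unscaled run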
# Hyperparameter selection

from sklearn.model_selection import GridSearchCV

param_grid = [
    {
        "weights": ["uniform"],
        "n_neighbors": [i for i in range(1, 11)]
    },
    {
        "weights": ["distance"],
        "n_neighbors": [i for i in range(1, 11)],
        "p": [i for i in range(1, 6)]
    }
]
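Counting what this grid implies: the first sub-grid contributes 10 candidates and the second 10 × 5 = 50, so the search tries 10 + 50 = 60 parameter combinations; with the 3-fold cross-validation used here, that is 60 × 3 = 180 model fits, matching the log printed below.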
knn_reg = KNeighborsRegressor()
grid_search = GridSearchCV(knn_reg, param_grid, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)
Fitting 3 folds for each of 60 candidates, totalling 180 fits

[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    0.6s finished

GridSearchCV(cv=None, error_score='raise',
       estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform'),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid=[{'weights': ['uniform'], 'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}, {'weights': ['distance'], 'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'p': [1, 2, 3, 4, 5]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)
grid_search.best_params_
{'n_neighbors': 6, 'p': 1, 'weights': 'distance'}
grid_search.best_score_
0.6046509483167782
grid_search.best_estimator_.score(X_test, y_test)
0.6664759945985371
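Note that best_score_ is the mean cross-validated R² computed on the training folds during the search, while the last line scores the refit best estimator on the held-out test set, so the two numbers are not directly comparable. Something close to the CV figure can be recomputed by hand, assuming the same 3-fold setup as in the log above:

from sklearn.model_selection import cross_val_score

# Mean R^2 over 3 folds of the training set for the selected model
cross_val_score(grid_search.best_estimator_, X_train, y_train, cv=3).mean()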

More discussion of the linear regression model

from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(X, y)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
lin_reg.coef_
array([-1.05574295e-01,  3.52748549e-02, -4.35179251e-02,  4.55405227e-01,
       -1.24268073e+01,  3.75411229e+00, -2.36116881e-02, -1.21088069e+00,
        2.50740082e-01, -1.37702943e-02, -8.38888137e-01,  7.93577159e-03,
       -3.50952134e-01])
# Get the indices that sort the coefficients from smallest to largest
np.argsort(lin_reg.coef_)
array([ 4,  7, 10, 12,  0,  2,  6,  9, 11,  1,  8,  3,  5])
boston.feature_names[np.argsort(lin_reg.coef_)]
array(['NOX', 'DIS', 'PTRATIO', 'LSTAT', 'CRIM', 'INDUS', 'AGE', 'TAX',
       'B', 'ZN', 'RAD', 'CHAS', 'RM'], dtype='<U7')
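The ranking is easier to read with the coefficient values attached; RM (average rooms per dwelling) carries the largest positive weight and NOX (nitric oxides concentration) the most negative one, which matches intuition about house prices. A small illustrative sketch, with formatting choices of my own:

# Pair each feature name with its coefficient, sorted from most negative to most positive
for name, coef in sorted(zip(boston.feature_names, lin_reg.coef_),
                         key=lambda pair: pair[1]):
    print(f"{name:8s}{coef: .4f}")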
print(boston.DESCR)
Boston House Prices dataset
===========================

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
        - B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
        - LSTAT    % lower status of the population
        - MEDV     Median value of owner-occupied homes in $1000's

    :Missing Attribute Values: None

    :Creator: Harrison, D. and Rubinfeld, D.L.

This is a copy of UCI ML housing dataset.
http://archive.ics.uci.edu/ml/datasets/Housing


This dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.

The Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic
prices and the demand for clean air', J. Environ. Economics & Management,
vol.5, 81-102, 1978.   Used in Belsley, Kuh & Welsch, 'Regression diagnostics
...', Wiley, 1980.   N.B. Various transformations are used in the table on
pages 244-261 of the latter.

The Boston house-price data has been used in many machine learning papers that address regression
problems.   
     
**References**

   - Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.
   - Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.
   - many more! (see http://archive.ics.uci.edu/ml/datasets/Housing)
