机器学习 August 20, 2018

2-5 数据归一化

Words count 16k Reading time 15 mins. Read count 0

问题:各特征的量纲(数值范围)不同时,样本间的距离会被数值较大的特征主导。解决方案:将所有的数据映射到同一尺度

最值归一化:normalization

把所有数据映射到0-1之间:x_scale = (x - x_min) / (x_max - x_min)

均值方差归一化 standardization

把所有数据归一到均值为0方差为1的分布中:x_scale = (x - x_mean) / S,其中 S 为标准差

适用于:数据分布没有明显的边界;有可能存在极端数据值

数据归一化 Normalization

import numpy as np
import matplotlib.pyplot as plt
# Draw 100 random integers from [0, 100) as sample data for min-max scaling.
x = np.random.randint(low=0, high=100, size=100)
x
array([88, 64, 34, 56, 68, 54, 19, 45, 53, 34, 18, 34, 21, 35,  2, 94, 61,
       31, 93, 78, 34, 96, 86, 57, 89, 50, 57, 34, 31, 16, 63,  8, 81, 10,
       74, 93, 94, 58, 93, 73,  5, 10, 90, 29, 73, 92,  1, 44, 79, 41, 16,
       83, 57, 83, 21, 20, 26, 95, 80,  2,  1, 77, 92, 22, 44, 92, 70, 49,
       19, 99, 86, 99, 52, 20,  0, 11, 83, 62,  2, 81, 95, 18, 90, 99, 43,
       12, 47, 91, 78, 31, 12, 46, 43, 92,  6, 38, 22, 32, 89, 62])
# Min-max normalization: rescale x linearly so its smallest value maps to 0 and its largest to 1.
(x - x.min()) / (x.max() - x.min())
array([0.88888889, 0.64646465, 0.34343434, 0.56565657, 0.68686869,
       0.54545455, 0.19191919, 0.45454545, 0.53535354, 0.34343434,
       0.18181818, 0.34343434, 0.21212121, 0.35353535, 0.02020202,
       0.94949495, 0.61616162, 0.31313131, 0.93939394, 0.78787879,
       0.34343434, 0.96969697, 0.86868687, 0.57575758, 0.8989899 ,
       0.50505051, 0.57575758, 0.34343434, 0.31313131, 0.16161616,
       0.63636364, 0.08080808, 0.81818182, 0.1010101 , 0.74747475,
       0.93939394, 0.94949495, 0.58585859, 0.93939394, 0.73737374,
       0.05050505, 0.1010101 , 0.90909091, 0.29292929, 0.73737374,
       0.92929293, 0.01010101, 0.44444444, 0.7979798 , 0.41414141,
       0.16161616, 0.83838384, 0.57575758, 0.83838384, 0.21212121,
       0.2020202 , 0.26262626, 0.95959596, 0.80808081, 0.02020202,
       0.01010101, 0.77777778, 0.92929293, 0.22222222, 0.44444444,
       0.92929293, 0.70707071, 0.49494949, 0.19191919, 1.        ,
       0.86868687, 1.        , 0.52525253, 0.2020202 , 0.        ,
       0.11111111, 0.83838384, 0.62626263, 0.02020202, 0.81818182,
       0.95959596, 0.18181818, 0.90909091, 1.        , 0.43434343,
       0.12121212, 0.47474747, 0.91919192, 0.78787879, 0.31313131,
       0.12121212, 0.46464646, 0.43434343, 0.92929293, 0.06060606,
       0.38383838, 0.22222222, 0.32323232, 0.8989899 , 0.62626263])

均值方差归一化 Standardization

# 50 samples with 2 integer features, each value drawn from [0, 100).
X2 = np.random.randint(low=0, high=100, size=(50, 2))
X2
array([[93, 86],
       [39, 54],
       [89, 31],
       [ 4,  3],
       [83, 10],
       [81, 26],
       [64, 14],
       [29, 15],
       [58,  0],
       [23, 39],
       [48, 36],
       [40, 26],
       [18,  4],
       [61, 97],
       [28, 35],
       [20,  4],
       [56,  0],
       [60, 87],
       [85, 66],
       [81, 63],
       [84, 15],
       [98,  2],
       [34, 45],
       [78, 30],
       [37, 49],
       [96,  6],
       [81, 86],
       [ 6, 98],
       [88, 86],
       [74, 12],
       [64, 98],
       [ 6, 28],
       [47, 36],
       [50, 33],
       [65, 92],
       [36, 77],
       [67, 78],
       [66, 86],
       [37, 13],
       [46, 11],
       [81, 45],
       [46, 78],
       [71, 79],
       [97,  4],
       [88, 19],
       [32, 33],
       [12, 44],
       [76, 30],
       [ 5, 49],
       [41, 35]])
# Convert to float BEFORE standardizing. X2 comes from np.random.randint, so it
# is an integer array, and writing standardized values into an integer column
# truncates them to whole numbers. The original line bound the float copy to
# `x2` (lowercase) and left X2 integer, which is why the recorded outputs
# below contain only -1, 0 and 1. With X2 itself rebound to the float copy,
# the columns hold the real standardized values.
X2 = np.array(X2, dtype=float)
# Standardize feature 0: subtract its mean and divide by its standard deviation.
X2[:, 0] = (X2[:, 0] - np.mean(X2[:, 0])) / np.std(X2[:, 0])
X2
array([[ 1, 86],
       [ 0, 54],
       [ 1, 31],
       [-1,  3],
       [ 1, 10],
       [ 0, 26],
       [ 0, 14],
       [ 0, 15],
       [ 0,  0],
       [-1, 39],
       [ 0, 36],
       [ 0, 26],
       [-1,  4],
       [ 0, 97],
       [-1, 35],
       [-1,  4],
       [ 0,  0],
       [ 0, 87],
       [ 1, 66],
       [ 0, 63],
       [ 1, 15],
       [ 1,  2],
       [ 0, 45],
       [ 0, 30],
       [ 0, 49],
       [ 1,  6],
       [ 0, 86],
       [-1, 98],
       [ 1, 86],
       [ 0, 12],
       [ 0, 98],
       [-1, 28],
       [ 0, 36],
       [ 0, 33],
       [ 0, 92],
       [ 0, 77],
       [ 0, 78],
       [ 0, 86],
       [ 0, 13],
       [ 0, 11],
       [ 0, 45],
       [ 0, 78],
       [ 0, 79],
       [ 1,  4],
       [ 1, 19],
       [ 0, 33],
       [-1, 44],
       [ 0, 30],
       [-1, 49],
       [ 0, 35]])
# Standardize feature 1: center it on the column mean and scale by the column standard deviation.
X2[:, 1] = (X2[:, 1] - X2[:, 1].mean()) / X2[:, 1].std()
X2
array([[ 1,  1],
       [ 0,  0],
       [ 1,  0],
       [-1, -1],
       [ 1, -1],
       [ 0,  0],
       [ 0,  0],
       [ 0,  0],
       [ 0, -1],
       [-1,  0],
       [ 0,  0],
       [ 0,  0],
       [-1, -1],
       [ 0,  1],
       [-1,  0],
       [-1, -1],
       [ 0, -1],
       [ 0,  1],
       [ 1,  0],
       [ 0,  0],
       [ 1,  0],
       [ 1, -1],
       [ 0,  0],
       [ 0,  0],
       [ 0,  0],
       [ 1, -1],
       [ 0,  1],
       [-1,  1],
       [ 1,  1],
       [ 0,  0],
       [ 0,  1],
       [-1,  0],
       [ 0,  0],
       [ 0,  0],
       [ 0,  1],
       [ 0,  1],
       [ 0,  1],
       [ 0,  1],
       [ 0,  0],
       [ 0,  0],
       [ 0,  0],
       [ 0,  1],
       [ 0,  1],
       [ 1, -1],
       [ 1,  0],
       [ 0,  0],
       [-1,  0],
       [ 0,  0],
       [-1,  0],
       [ 0,  0]])
# Scatter plot of the two features of X2 (column 0 on the x axis, column 1 on the y axis).
plt.scatter(x=X2[:, 0], y=X2[:, 1])
plt.show()

对测试数据如何归一化

测试数据集不能用它自己的均值和方差来归一化,而要使用训练数据集的均值和标准差:(x_test - mean_train) / std_train。因此要保存训练数据集得到的均值和方差

Scikit-learn中的Scaler

from sklearn import datasets
# Load the iris dataset bundled with scikit-learn.
iris = datasets.load_iris()
X, y = iris.data, iris.target
# Show the first ten rows of the feature matrix.
X[:10]
array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1]])
# Hold out 20% of the samples for testing; random_state pins the shuffle so the split is reproducible.
from sklearn.model_selection import train_test_split
# NOTE: the test features are bound to `X_text` (sic); later code reads them under that name.
X_train, X_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=666
)

scikit-learn中的StandardScaler

from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training set only, so its statistics come from training data.
standarScaler = StandardScaler()
standarScaler.fit(X_train)
StandardScaler(copy=True, with_mean=True, with_std=True)
# Per-feature means learned by fit() on the training set.
standarScaler.mean_
array([5.83416667, 3.0825    , 3.70916667, 1.16916667])
# Per-feature scaling factors (standard deviations) learned by fit(); transform() divides by these.
standarScaler.scale_
array([0.81019502, 0.44076874, 1.76295187, 0.75429833])
# Standardize the training features with the fitted statistics (this overwrites X_train).
X_train = standarScaler.transform(X_train)
X_train
array([[-0.90616043,  0.94720873, -1.30982967, -1.28485856],
       [-1.15301457, -0.18717298, -1.30982967, -1.28485856],
       [-0.16559799, -0.64092567,  0.22169257,  0.17345038],
       [ 0.45153738,  0.72033239,  0.95909217,  1.49918578],
       [-0.90616043, -1.3215547 , -0.40226093, -0.0916967 ],
       [ 1.43895396,  0.2665797 ,  0.56203085,  0.30602392],
       [ 0.3281103 , -1.09467835,  1.07253826,  0.30602392],
       [ 2.1795164 , -0.18717298,  1.63976872,  1.2340387 ],
       [-0.78273335,  2.30846679, -1.25310662, -1.4174321 ],
       [ 0.45153738, -2.00218372,  0.44858475,  0.43859746],
       [ 1.80923518, -0.41404933,  1.46959958,  0.83631808],
       [ 0.69839152,  0.2665797 ,  0.90236912,  1.49918578],
       [ 0.20468323,  0.72033239,  0.44858475,  0.571171  ],
       [-0.78273335, -0.86780201,  0.10824648,  0.30602392],
       [-0.53587921,  1.40096142, -1.25310662, -1.28485856],
       [-0.65930628,  1.40096142, -1.25310662, -1.28485856],
       [-1.0295875 ,  0.94720873, -1.19638358, -0.7545644 ],
       [-1.77014994, -0.41404933, -1.30982967, -1.28485856],
       [-0.04217092, -0.86780201,  0.10824648,  0.04087684],
       [-0.78273335,  0.72033239, -1.30982967, -1.28485856],
       [-1.52329579,  0.72033239, -1.30982967, -1.15228502],
       [ 0.82181859,  0.2665797 ,  0.78892303,  1.10146516],
       [-0.16559799, -0.41404933,  0.27841562,  0.17345038],
       [ 0.94524567, -0.18717298,  0.39186171,  0.30602392],
       [ 0.20468323, -0.41404933,  0.44858475,  0.43859746],
       [-1.39986872,  0.2665797 , -1.19638358, -1.28485856],
       [-1.15301457,  0.03970336, -1.25310662, -1.4174321 ],
       [ 1.06867274,  0.03970336,  1.07253826,  1.63175932],
       [ 0.57496445, -0.86780201,  0.67547694,  0.83631808],
       [ 0.3281103 , -0.64092567,  0.56203085,  0.04087684],
       [ 0.45153738, -0.64092567,  0.61875389,  0.83631808],
       [-0.16559799,  2.98909581, -1.25310662, -1.01971148],
       [ 0.57496445, -1.3215547 ,  0.67547694,  0.43859746],
       [ 0.69839152, -0.41404933,  0.33513866,  0.17345038],
       [-0.90616043,  1.62783776, -1.02621444, -1.01971148],
       [ 1.19209981, -0.64092567,  0.61875389,  0.30602392],
       [-0.90616043,  0.94720873, -1.30982967, -1.15228502],
       [-1.89357701, -0.18717298, -1.47999881, -1.4174321 ],
       [ 0.08125616, -0.18717298,  0.78892303,  0.83631808],
       [ 0.69839152, -0.64092567,  1.07253826,  1.2340387 ],
       [-0.28902506, -0.64092567,  0.67547694,  1.10146516],
       [-0.41245214, -1.54843104, -0.00519961, -0.22427024],
       [ 1.31552689,  0.03970336,  0.67547694,  0.43859746],
       [ 0.57496445,  0.72033239,  1.07253826,  1.63175932],
       [ 0.82181859, -0.18717298,  1.18598435,  1.36661224],
       [-0.16559799,  1.62783776, -1.13966053, -1.15228502],
       [ 0.94524567, -0.41404933,  0.5053078 ,  0.17345038],
       [ 1.06867274,  0.49345605,  1.12926131,  1.76433286],
       [-1.27644165, -0.18717298, -1.30982967, -1.4174321 ],
       [-1.0295875 ,  1.17408507, -1.30982967, -1.28485856],
       [ 0.20468323, -0.18717298,  0.61875389,  0.83631808],
       [-1.0295875 , -0.18717298, -1.19638358, -1.28485856],
       [ 0.3281103 , -0.18717298,  0.67547694,  0.83631808],
       [ 0.69839152,  0.03970336,  1.01581521,  0.83631808],
       [-0.90616043,  1.40096142, -1.25310662, -1.01971148],
       [-0.16559799, -0.18717298,  0.27841562,  0.04087684],
       [-1.0295875 ,  0.94720873, -1.36655271, -1.15228502],
       [-0.90616043,  1.62783776, -1.25310662, -1.15228502],
       [-1.52329579,  0.2665797 , -1.30982967, -1.28485856],
       [-0.53587921, -0.18717298,  0.44858475,  0.43859746],
       [ 0.82181859, -0.64092567,  0.5053078 ,  0.43859746],
       [ 0.3281103 , -0.64092567,  0.16496953,  0.17345038],
       [-1.27644165,  0.72033239, -1.19638358, -1.28485856],
       [-0.90616043,  0.49345605, -1.13966053, -0.88713794],
       [-0.04217092, -0.86780201,  0.78892303,  0.96889162],
       [-0.28902506, -0.18717298,  0.22169257,  0.17345038],
       [ 0.57496445, -0.64092567,  0.78892303,  0.43859746],
       [ 1.06867274,  0.49345605,  1.12926131,  1.2340387 ],
       [ 1.68580811, -0.18717298,  1.18598435,  0.571171  ],
       [ 1.06867274, -0.18717298,  0.84564608,  1.49918578],
       [-1.15301457,  0.03970336, -1.25310662, -1.4174321 ],
       [-1.15301457, -1.3215547 ,  0.44858475,  0.70374454],
       [-0.16559799, -1.3215547 ,  0.73219998,  1.10146516],
       [-1.15301457, -1.54843104, -0.2320918 , -0.22427024],
       [-0.41245214, -1.54843104,  0.05152343, -0.0916967 ],
       [ 1.06867274, -1.3215547 ,  1.18598435,  0.83631808],
       [ 0.82181859, -0.18717298,  1.01581521,  0.83631808],
       [-0.16559799, -1.09467835, -0.1186457 , -0.22427024],
       [ 0.20468323, -2.00218372,  0.73219998,  0.43859746],
       [ 1.06867274,  0.03970336,  0.56203085,  0.43859746],
       [-1.15301457,  0.03970336, -1.25310662, -1.4174321 ],
       [ 0.57496445, -1.3215547 ,  0.73219998,  0.96889162],
       [-1.39986872,  0.2665797 , -1.36655271, -1.28485856],
       [ 0.20468323, -0.86780201,  0.78892303,  0.571171  ],
       [-0.04217092, -1.09467835,  0.16496953,  0.04087684],
       [ 1.31552689,  0.2665797 ,  1.12926131,  1.49918578],
       [-1.77014994, -0.18717298, -1.36655271, -1.28485856],
       [ 1.56238103, -0.18717298,  1.2427074 ,  1.2340387 ],
       [ 1.19209981,  0.2665797 ,  1.2427074 ,  1.49918578],
       [-0.78273335,  0.94720873, -1.25310662, -1.28485856],
       [ 2.54979762,  1.62783776,  1.52632263,  1.10146516],
       [ 0.69839152, -0.64092567,  1.07253826,  1.36661224],
       [-0.28902506, -0.41404933, -0.06192266,  0.17345038],
       [-0.41245214,  2.53534313, -1.30982967, -1.28485856],
       [-1.27644165, -0.18717298, -1.30982967, -1.15228502],
       [ 0.57496445, -0.41404933,  1.07253826,  0.83631808],
       [-1.77014994,  0.2665797 , -1.36655271, -1.28485856],
       [-0.53587921,  1.8547141 , -1.13966053, -1.01971148],
       [-1.0295875 ,  0.72033239, -1.19638358, -1.01971148],
       [ 1.06867274, -0.18717298,  0.73219998,  0.70374454],
       [-0.53587921,  1.8547141 , -1.36655271, -1.01971148],
       [ 2.30294347, -0.64092567,  1.69649176,  1.10146516],
       [-0.28902506, -0.86780201,  0.27841562,  0.17345038],
       [ 1.19209981, -0.18717298,  1.01581521,  1.2340387 ],
       [-0.41245214,  0.94720873, -1.36655271, -1.28485856],
       [-1.27644165,  0.72033239, -1.02621444, -1.28485856],
       [-0.53587921,  0.72033239, -1.13966053, -1.28485856],
       [ 2.30294347,  1.62783776,  1.69649176,  1.36661224],
       [ 1.31552689,  0.03970336,  0.95909217,  1.2340387 ],
       [-0.28902506, -1.3215547 ,  0.10824648, -0.0916967 ],
       [-0.90616043,  0.72033239, -1.25310662, -1.28485856],
       [-0.90616043,  1.62783776, -1.19638358, -1.28485856],
       [ 0.3281103 , -0.41404933,  0.56203085,  0.30602392],
       [-0.04217092,  2.08159044, -1.42327576, -1.28485856],
       [-1.0295875 , -2.45593641, -0.1186457 , -0.22427024],
       [ 0.69839152,  0.2665797 ,  0.44858475,  0.43859746],
       [ 0.3281103 , -0.18717298,  0.5053078 ,  0.30602392],
       [ 0.08125616,  0.2665797 ,  0.61875389,  0.83631808],
       [ 0.20468323, -2.00218372,  0.16496953, -0.22427024],
       [ 1.93266225, -0.64092567,  1.35615349,  0.96889162]])
from sklearn.neighbors import KNeighborsClassifier
# Scale the test features with the scaler that was fitted on the training set.
X_test = standarScaler.transform(X_text)
# Train a 3-nearest-neighbours classifier on the standardized training data.
knn_clf = KNeighborsClassifier(n_neighbors=3)
knn_clf.fit(X_train, y_train)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')
# Mean accuracy of the classifier on the standardized test set.
knn_clf.score(X_test,y_test)
1.0
0%