解决方案:将所有的数据映射到同一尺度
最值归一化:normalization
把所有数据映射到0-1之间
均值方差归一化 standardization
把所有数据归一到均值为0方差为1的分布中
适用于:数据分布没有明显的边界;有可能存在极端数据值
数据归一化 Normalization
# Min-max normalization demo: map all values into the [0, 1] range.
import numpy as np
import matplotlib.pyplot as plt
# 100 random integers drawn from [0, 100) as the sample vector.
x = np.random.randint(0,100,size=100)
x
array([88, 64, 34, 56, 68, 54, 19, 45, 53, 34, 18, 34, 21, 35, 2, 94, 61,
31, 93, 78, 34, 96, 86, 57, 89, 50, 57, 34, 31, 16, 63, 8, 81, 10,
74, 93, 94, 58, 93, 73, 5, 10, 90, 29, 73, 92, 1, 44, 79, 41, 16,
83, 57, 83, 21, 20, 26, 95, 80, 2, 1, 77, 92, 22, 44, 92, 70, 49,
19, 99, 86, 99, 52, 20, 0, 11, 83, 62, 2, 81, 95, 18, 90, 99, 43,
12, 47, 91, 78, 31, 12, 46, 43, 92, 6, 38, 22, 32, 89, 62])
# Min-max scaling: (x - min) / (max - min) maps every value into [0, 1].
(x -np.min(x))/(np.max(x)-np.min(x))
array([0.88888889, 0.64646465, 0.34343434, 0.56565657, 0.68686869,
0.54545455, 0.19191919, 0.45454545, 0.53535354, 0.34343434,
0.18181818, 0.34343434, 0.21212121, 0.35353535, 0.02020202,
0.94949495, 0.61616162, 0.31313131, 0.93939394, 0.78787879,
0.34343434, 0.96969697, 0.86868687, 0.57575758, 0.8989899 ,
0.50505051, 0.57575758, 0.34343434, 0.31313131, 0.16161616,
0.63636364, 0.08080808, 0.81818182, 0.1010101 , 0.74747475,
0.93939394, 0.94949495, 0.58585859, 0.93939394, 0.73737374,
0.05050505, 0.1010101 , 0.90909091, 0.29292929, 0.73737374,
0.92929293, 0.01010101, 0.44444444, 0.7979798 , 0.41414141,
0.16161616, 0.83838384, 0.57575758, 0.83838384, 0.21212121,
0.2020202 , 0.26262626, 0.95959596, 0.80808081, 0.02020202,
0.01010101, 0.77777778, 0.92929293, 0.22222222, 0.44444444,
0.92929293, 0.70707071, 0.49494949, 0.19191919, 1. ,
0.86868687, 1. , 0.52525253, 0.2020202 , 0. ,
0.11111111, 0.83838384, 0.62626263, 0.02020202, 0.81818182,
0.95959596, 0.18181818, 0.90909091, 1. , 0.43434343,
0.12121212, 0.47474747, 0.91919192, 0.78787879, 0.31313131,
0.12121212, 0.46464646, 0.43434343, 0.92929293, 0.06060606,
0.38383838, 0.22222222, 0.32323232, 0.8989899 , 0.62626263])
均值方差归一化 Standardization
# 50x2 matrix of random integers in [0, 100) — two features to standardize.
X2 = np.random.randint(0,100,(50,2))
X2
array([[93, 86],
[39, 54],
[89, 31],
[ 4, 3],
[83, 10],
[81, 26],
[64, 14],
[29, 15],
[58, 0],
[23, 39],
[48, 36],
[40, 26],
[18, 4],
[61, 97],
[28, 35],
[20, 4],
[56, 0],
[60, 87],
[85, 66],
[81, 63],
[84, 15],
[98, 2],
[34, 45],
[78, 30],
[37, 49],
[96, 6],
[81, 86],
[ 6, 98],
[88, 86],
[74, 12],
[64, 98],
[ 6, 28],
[47, 36],
[50, 33],
[65, 92],
[36, 77],
[67, 78],
[66, 86],
[37, 13],
[46, 11],
[81, 45],
[46, 78],
[71, 79],
[97, 4],
[88, 19],
[32, 33],
[12, 44],
[76, 30],
[ 5, 49],
[41, 35]])
# Convert to float BEFORE standardizing: writing standardized values back
# into an integer array truncates them to 0/±1.
# BUG FIX: the original assigned the float copy to lowercase `x2` and never
# used it, so the in-place standardization below ran on the int array —
# the recorded 0/±1 output further down is the symptom of that bug.
X2 = np.array(X2,dtype=float)
# Standardize the first feature in place: zero mean, unit variance.
X2[:,0] = (X2[:,0]-np.mean(X2[:,0]))/np.std(X2[:,0])
X2
array([[ 1, 86],
[ 0, 54],
[ 1, 31],
[-1, 3],
[ 1, 10],
[ 0, 26],
[ 0, 14],
[ 0, 15],
[ 0, 0],
[-1, 39],
[ 0, 36],
[ 0, 26],
[-1, 4],
[ 0, 97],
[-1, 35],
[-1, 4],
[ 0, 0],
[ 0, 87],
[ 1, 66],
[ 0, 63],
[ 1, 15],
[ 1, 2],
[ 0, 45],
[ 0, 30],
[ 0, 49],
[ 1, 6],
[ 0, 86],
[-1, 98],
[ 1, 86],
[ 0, 12],
[ 0, 98],
[-1, 28],
[ 0, 36],
[ 0, 33],
[ 0, 92],
[ 0, 77],
[ 0, 78],
[ 0, 86],
[ 0, 13],
[ 0, 11],
[ 0, 45],
[ 0, 78],
[ 0, 79],
[ 1, 4],
[ 1, 19],
[ 0, 33],
[-1, 44],
[ 0, 30],
[-1, 49],
[ 0, 35]])
# Standardize the second feature in place: zero mean, unit variance.
# NOTE(review): this only produces correct floats if X2 has float dtype;
# the recorded output below (all values 0/±1) shows it actually ran on the
# int array — the float conversion earlier was assigned to the wrong name.
X2[:,1] = (X2[:,1]-np.mean(X2[:,1]))/np.std(X2[:,1])
X2
array([[ 1, 1],
[ 0, 0],
[ 1, 0],
[-1, -1],
[ 1, -1],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, -1],
[-1, 0],
[ 0, 0],
[ 0, 0],
[-1, -1],
[ 0, 1],
[-1, 0],
[-1, -1],
[ 0, -1],
[ 0, 1],
[ 1, 0],
[ 0, 0],
[ 1, 0],
[ 1, -1],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 1, -1],
[ 0, 1],
[-1, 1],
[ 1, 1],
[ 0, 0],
[ 0, 1],
[-1, 0],
[ 0, 0],
[ 0, 0],
[ 0, 1],
[ 0, 1],
[ 0, 1],
[ 0, 1],
[ 0, 0],
[ 0, 0],
[ 0, 0],
[ 0, 1],
[ 0, 1],
[ 1, -1],
[ 1, 0],
[ 0, 0],
[-1, 0],
[ 0, 0],
[-1, 0],
[ 0, 0]])
# Scatter plot of the two standardized features.
plt.scatter(X2[:,0],X2[:,1])
plt.show()
对测试数据如何归一化
要保存训练数据集得到的均值和方差
Scikit-learn中的Scaler
from sklearn import datasets
# Load the iris dataset
iris = datasets.load_iris()
X = iris.data
y = iris.target
# Show the first ten samples
X[:10,:]
array([[5.1, 3.5, 1.4, 0.2],
[4.9, 3. , 1.4, 0.2],
[4.7, 3.2, 1.3, 0.2],
[4.6, 3.1, 1.5, 0.2],
[5. , 3.6, 1.4, 0.2],
[5.4, 3.9, 1.7, 0.4],
[4.6, 3.4, 1.4, 0.3],
[5. , 3.4, 1.5, 0.2],
[4.4, 2.9, 1.4, 0.2],
[4.9, 3.1, 1.5, 0.1]])
# Split 80/20 with a fixed random_state for reproducibility.
from sklearn.model_selection import train_test_split
# NOTE(review): "X_text" is a typo for X_test, but the same misspelled name
# is referenced again when transforming the test set below — renaming must
# be done in both places at once.
X_train,X_text,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=666)
scikit-learn中的StandardScaler
from sklearn.preprocessing import StandardScaler
# NOTE(review): "standarScaler" is a typo for standardScaler — kept as-is
# because the same name is reused in all the following cells.
standarScaler = StandardScaler()
# fit() learns the per-feature mean and scale from the TRAINING set only.
standarScaler.fit(X_train)
StandardScaler(copy=True, with_mean=True, with_std=True)
# Per-feature mean learned from the training set.
standarScaler.mean_
array([5.83416667, 3.0825 , 3.70916667, 1.16916667])
# Per-feature scale (standard deviation) learned from the training set.
standarScaler.scale_
array([0.81019502, 0.44076874, 1.76295187, 0.75429833])
# Transform the training set using the learned statistics.
X_train = standarScaler.transform(X_train)
X_train
array([[-0.90616043, 0.94720873, -1.30982967, -1.28485856],
[-1.15301457, -0.18717298, -1.30982967, -1.28485856],
[-0.16559799, -0.64092567, 0.22169257, 0.17345038],
[ 0.45153738, 0.72033239, 0.95909217, 1.49918578],
[-0.90616043, -1.3215547 , -0.40226093, -0.0916967 ],
[ 1.43895396, 0.2665797 , 0.56203085, 0.30602392],
[ 0.3281103 , -1.09467835, 1.07253826, 0.30602392],
[ 2.1795164 , -0.18717298, 1.63976872, 1.2340387 ],
[-0.78273335, 2.30846679, -1.25310662, -1.4174321 ],
[ 0.45153738, -2.00218372, 0.44858475, 0.43859746],
[ 1.80923518, -0.41404933, 1.46959958, 0.83631808],
[ 0.69839152, 0.2665797 , 0.90236912, 1.49918578],
[ 0.20468323, 0.72033239, 0.44858475, 0.571171 ],
[-0.78273335, -0.86780201, 0.10824648, 0.30602392],
[-0.53587921, 1.40096142, -1.25310662, -1.28485856],
[-0.65930628, 1.40096142, -1.25310662, -1.28485856],
[-1.0295875 , 0.94720873, -1.19638358, -0.7545644 ],
[-1.77014994, -0.41404933, -1.30982967, -1.28485856],
[-0.04217092, -0.86780201, 0.10824648, 0.04087684],
[-0.78273335, 0.72033239, -1.30982967, -1.28485856],
[-1.52329579, 0.72033239, -1.30982967, -1.15228502],
[ 0.82181859, 0.2665797 , 0.78892303, 1.10146516],
[-0.16559799, -0.41404933, 0.27841562, 0.17345038],
[ 0.94524567, -0.18717298, 0.39186171, 0.30602392],
[ 0.20468323, -0.41404933, 0.44858475, 0.43859746],
[-1.39986872, 0.2665797 , -1.19638358, -1.28485856],
[-1.15301457, 0.03970336, -1.25310662, -1.4174321 ],
[ 1.06867274, 0.03970336, 1.07253826, 1.63175932],
[ 0.57496445, -0.86780201, 0.67547694, 0.83631808],
[ 0.3281103 , -0.64092567, 0.56203085, 0.04087684],
[ 0.45153738, -0.64092567, 0.61875389, 0.83631808],
[-0.16559799, 2.98909581, -1.25310662, -1.01971148],
[ 0.57496445, -1.3215547 , 0.67547694, 0.43859746],
[ 0.69839152, -0.41404933, 0.33513866, 0.17345038],
[-0.90616043, 1.62783776, -1.02621444, -1.01971148],
[ 1.19209981, -0.64092567, 0.61875389, 0.30602392],
[-0.90616043, 0.94720873, -1.30982967, -1.15228502],
[-1.89357701, -0.18717298, -1.47999881, -1.4174321 ],
[ 0.08125616, -0.18717298, 0.78892303, 0.83631808],
[ 0.69839152, -0.64092567, 1.07253826, 1.2340387 ],
[-0.28902506, -0.64092567, 0.67547694, 1.10146516],
[-0.41245214, -1.54843104, -0.00519961, -0.22427024],
[ 1.31552689, 0.03970336, 0.67547694, 0.43859746],
[ 0.57496445, 0.72033239, 1.07253826, 1.63175932],
[ 0.82181859, -0.18717298, 1.18598435, 1.36661224],
[-0.16559799, 1.62783776, -1.13966053, -1.15228502],
[ 0.94524567, -0.41404933, 0.5053078 , 0.17345038],
[ 1.06867274, 0.49345605, 1.12926131, 1.76433286],
[-1.27644165, -0.18717298, -1.30982967, -1.4174321 ],
[-1.0295875 , 1.17408507, -1.30982967, -1.28485856],
[ 0.20468323, -0.18717298, 0.61875389, 0.83631808],
[-1.0295875 , -0.18717298, -1.19638358, -1.28485856],
[ 0.3281103 , -0.18717298, 0.67547694, 0.83631808],
[ 0.69839152, 0.03970336, 1.01581521, 0.83631808],
[-0.90616043, 1.40096142, -1.25310662, -1.01971148],
[-0.16559799, -0.18717298, 0.27841562, 0.04087684],
[-1.0295875 , 0.94720873, -1.36655271, -1.15228502],
[-0.90616043, 1.62783776, -1.25310662, -1.15228502],
[-1.52329579, 0.2665797 , -1.30982967, -1.28485856],
[-0.53587921, -0.18717298, 0.44858475, 0.43859746],
[ 0.82181859, -0.64092567, 0.5053078 , 0.43859746],
[ 0.3281103 , -0.64092567, 0.16496953, 0.17345038],
[-1.27644165, 0.72033239, -1.19638358, -1.28485856],
[-0.90616043, 0.49345605, -1.13966053, -0.88713794],
[-0.04217092, -0.86780201, 0.78892303, 0.96889162],
[-0.28902506, -0.18717298, 0.22169257, 0.17345038],
[ 0.57496445, -0.64092567, 0.78892303, 0.43859746],
[ 1.06867274, 0.49345605, 1.12926131, 1.2340387 ],
[ 1.68580811, -0.18717298, 1.18598435, 0.571171 ],
[ 1.06867274, -0.18717298, 0.84564608, 1.49918578],
[-1.15301457, 0.03970336, -1.25310662, -1.4174321 ],
[-1.15301457, -1.3215547 , 0.44858475, 0.70374454],
[-0.16559799, -1.3215547 , 0.73219998, 1.10146516],
[-1.15301457, -1.54843104, -0.2320918 , -0.22427024],
[-0.41245214, -1.54843104, 0.05152343, -0.0916967 ],
[ 1.06867274, -1.3215547 , 1.18598435, 0.83631808],
[ 0.82181859, -0.18717298, 1.01581521, 0.83631808],
[-0.16559799, -1.09467835, -0.1186457 , -0.22427024],
[ 0.20468323, -2.00218372, 0.73219998, 0.43859746],
[ 1.06867274, 0.03970336, 0.56203085, 0.43859746],
[-1.15301457, 0.03970336, -1.25310662, -1.4174321 ],
[ 0.57496445, -1.3215547 , 0.73219998, 0.96889162],
[-1.39986872, 0.2665797 , -1.36655271, -1.28485856],
[ 0.20468323, -0.86780201, 0.78892303, 0.571171 ],
[-0.04217092, -1.09467835, 0.16496953, 0.04087684],
[ 1.31552689, 0.2665797 , 1.12926131, 1.49918578],
[-1.77014994, -0.18717298, -1.36655271, -1.28485856],
[ 1.56238103, -0.18717298, 1.2427074 , 1.2340387 ],
[ 1.19209981, 0.2665797 , 1.2427074 , 1.49918578],
[-0.78273335, 0.94720873, -1.25310662, -1.28485856],
[ 2.54979762, 1.62783776, 1.52632263, 1.10146516],
[ 0.69839152, -0.64092567, 1.07253826, 1.36661224],
[-0.28902506, -0.41404933, -0.06192266, 0.17345038],
[-0.41245214, 2.53534313, -1.30982967, -1.28485856],
[-1.27644165, -0.18717298, -1.30982967, -1.15228502],
[ 0.57496445, -0.41404933, 1.07253826, 0.83631808],
[-1.77014994, 0.2665797 , -1.36655271, -1.28485856],
[-0.53587921, 1.8547141 , -1.13966053, -1.01971148],
[-1.0295875 , 0.72033239, -1.19638358, -1.01971148],
[ 1.06867274, -0.18717298, 0.73219998, 0.70374454],
[-0.53587921, 1.8547141 , -1.36655271, -1.01971148],
[ 2.30294347, -0.64092567, 1.69649176, 1.10146516],
[-0.28902506, -0.86780201, 0.27841562, 0.17345038],
[ 1.19209981, -0.18717298, 1.01581521, 1.2340387 ],
[-0.41245214, 0.94720873, -1.36655271, -1.28485856],
[-1.27644165, 0.72033239, -1.02621444, -1.28485856],
[-0.53587921, 0.72033239, -1.13966053, -1.28485856],
[ 2.30294347, 1.62783776, 1.69649176, 1.36661224],
[ 1.31552689, 0.03970336, 0.95909217, 1.2340387 ],
[-0.28902506, -1.3215547 , 0.10824648, -0.0916967 ],
[-0.90616043, 0.72033239, -1.25310662, -1.28485856],
[-0.90616043, 1.62783776, -1.19638358, -1.28485856],
[ 0.3281103 , -0.41404933, 0.56203085, 0.30602392],
[-0.04217092, 2.08159044, -1.42327576, -1.28485856],
[-1.0295875 , -2.45593641, -0.1186457 , -0.22427024],
[ 0.69839152, 0.2665797 , 0.44858475, 0.43859746],
[ 0.3281103 , -0.18717298, 0.5053078 , 0.30602392],
[ 0.08125616, 0.2665797 , 0.61875389, 0.83631808],
[ 0.20468323, -2.00218372, 0.16496953, -0.22427024],
[ 1.93266225, -0.64092567, 1.35615349, 0.96889162]])
# Transform the test set with the TRAINING statistics — never refit the
# scaler on test data. ("X_text" is the misspelled variable from the split.)
X_test =standarScaler.transform(X_text)
from sklearn.neighbors import KNeighborsClassifier
# kNN classifier with k=3 on the standardized features.
knn_clf = KNeighborsClassifier(n_neighbors=3)
knn_clf.fit(X_train,y_train)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=1, n_neighbors=3, p=2,
weights='uniform')
# Classification accuracy on the standardized test set.
knn_clf.score(X_test,y_test)
1.0