
Part 1: Regularized Linear Regression and Bias-Variance Exercise

Regularized Linear Regression

1.1 Data Visualization

from scipy.io import loadmat
data = loadmat('ex5data1.mat')
import numpy as np
X, y, Xval, yval, Xtest, ytest = map(np.ravel, [data['X'], data['y'], data['Xval'], data['yval'], data['Xtest'], data['ytest']])
import matplotlib.pyplot as plt
plt.scatter(X, y, marker='x', c='r')
plt.show()

1.2 Regularized Linear Regression Cost Function
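For reference, the cost implemented below is the standard regularized linear regression cost, where the penalty skips the intercept term \(\theta_0\):

\[
J(\theta) = \frac{1}{2m}\sum_{i=1}^{m}\left(h_\theta(x^{(i)}) - y^{(i)}\right)^2 + \frac{\lambda}{2m}\sum_{j=1}^{n}\theta_j^2
\]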

X = X.reshape(-1, 1)
X = np.insert(X, 0, 1, axis=1)   # add the intercept column
theta = np.ones(X.shape[1])
def reg_cost(theta, X, y, lamb):
    m = X.shape[0]
    t = X @ theta - y                              # (m,n) @ (n,) -> (m,)
    cost = (t.T @ t) / (2 * m)                     # squared-error term
    reg = lamb / (2 * m) * np.sum(theta[1:] ** 2)  # skip theta_0 in the penalty
    return cost + reg

1.3 Regularized Linear Regression Gradient
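The gradient that goes with this cost (again leaving \(\theta_0\) unregularized) is:

\[
\frac{\partial J}{\partial \theta_0} = \frac{1}{m}\sum_{i=1}^{m}\left(h_\theta(x^{(i)}) - y^{(i)}\right)x^{(i)}_0,
\qquad
\frac{\partial J}{\partial \theta_j} = \frac{1}{m}\sum_{i=1}^{m}\left(h_\theta(x^{(i)}) - y^{(i)}\right)x^{(i)}_j + \frac{\lambda}{m}\theta_j \quad (j \ge 1)
\]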

def reg_gradient(theta, X, y, lamb):
    m = len(X)
    grad = (X.T @ (X @ theta - y)) / m
    reg = lamb / m * theta   # same shape as theta
    reg[0] = 0               # do not regularize theta_0
    return grad + reg
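As a quick sanity check (an addition of mine, not in the original write-up): the Coursera ex5 handout reports that with \(\theta = [1, 1]\) and \(\lambda = 1\) the cost should be about 303.993 and the gradient about \([-15.30, 598.25]\); worth verifying against your own copy of the handout.

# sanity check against the values quoted in the ex5 handout (verify independently)
theta_test = np.ones(2)
print(reg_cost(theta_test, X, y, 1))      # should be roughly 303.993
print(reg_gradient(theta_test, X, y, 1))  # should be roughly [-15.30, 598.25]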

1.4 Fitting Linear Regression

import scipy.optimize as opt 
res = opt.minimize(fun=reg_cost, x0=theta, args=(X, y, 0), method='TNC', jac=reg_gradient)
res

     fun: 22.373906495108926
     jac: array([ 1.52549682e-07, -7.76071267e-09])
 message: 'Converged (|f_n-f_(n-1)| ~= 0)'
    nfev: 9
     nit: 4
  status: 1
 success: True
       x: array([13.08790367,  0.36777923])

plt.scatter(data['X'], y)
temp_X = np.arange(-50,40)
plt.plot(temp_X, res.x[0]+temp_X*res.x[1])
plt.show()

Bias and Variance

2.1 Learning Curves
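A learning curve plots the training error and the cross-validation error as a function of the number of training examples \(i\). Both errors are measured without the regularization term, which is why the costs recorded below are evaluated with \(\lambda = 0\) even though each model is trained with regularization:

\[
J_{train}(\theta) = \frac{1}{2i}\sum_{k=1}^{i}\left(h_\theta(x^{(k)}) - y^{(k)}\right)^2,
\qquad
J_{cv}(\theta) = \frac{1}{2m_{cv}}\sum_{k=1}^{m_{cv}}\left(h_\theta(x_{cv}^{(k)}) - y_{cv}^{(k)}\right)^2
\]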

m = X.shape[0]
Xval = Xval.reshape(-1,1)
Xval = np.insert(Xval, 0, 1, axis=1)
train_cost = []
cv_cost = []
for i in range(1, m+1):
    # train on the first i examples, then record the unregularized errors
    res = opt.minimize(fun=reg_cost, x0=np.ones(X.shape[1]), args=(X[:i], y[:i], 1), method='TNC', jac=reg_gradient)
    train_cost.append(reg_cost(res.x, X[:i], y[:i], 0))
    cv_cost.append(reg_cost(res.x, Xval, yval, 0))
plt.plot(np.arange(1,m+1), cv_cost, label='cv cost')
plt.plot(np.arange(1,m+1), train_cost, label='train cost')
plt.legend()
plt.show()

We can see that this model does not fit the data well; it is underfitting.

Polynomial Regression

3.1 Polynomial Features
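The idea is to map the single input feature \(x\) (the water level) to a vector of its powers, so that a linear model in the new features can fit a non-linear curve:

\[
x \mapsto \left[\, x,\ x^2,\ \dots,\ x^p \,\right]
\]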

# reload the raw (un-augmented) data
X, y, Xval, yval, Xtest, ytest = map(np.ravel, [data['X'], data['y'], data['Xval'], data['yval'], data['Xtest'], data['ytest']])
import pandas as pd
def poly_features(X, power):
    # build columns f1..f{power}: X, X^2, ..., X^power
    data = {f'f{i}': np.power(X, i) for i in range(1, power+1)}
    return pd.DataFrame(data)
poly_features(X, 3).head()  # quick check

3.2 Building the Polynomial Dataset

# z-score normalization of every column
def normalize_feature(df):
    return df.apply(lambda c: (c - c.mean()) / c.std())
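Because the polynomial features live on wildly different scales (compare \(x\) with \(x^8\)), every column is z-scored before training:

\[
x_{norm} = \frac{x - \mu}{\sigma}
\]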
# build the polynomial design matrices
def prepare_poly_data(*args, power):
    def prepare(X):
        df = poly_features(X, power)
        t = normalize_feature(df).values
        return np.insert(t, 0, 1, axis=1)   # add the intercept column
    return [prepare(x) for x in args]
X_poly, Xval_poly, Xtest_poly = prepare_poly_data(X, Xval, Xtest, power=8)
X_poly[:3, :]  # quick check
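One detail worth flagging: prepare_poly_data normalizes each split with its own mean and standard deviation, whereas the original exercise reuses the training set's statistics for the validation and test sets. A minimal sketch of that variant (my own addition; the function name prepare_poly_data_train_stats is hypothetical):

def prepare_poly_data_train_stats(X_train, *others, power):
    # fit the normalization statistics on the training split only
    df_train = poly_features(X_train, power)
    mu, sigma = df_train.mean(), df_train.std()
    def prepare(X):
        df = poly_features(X, power)
        t = ((df - mu) / sigma).values      # reuse the training mean/std
        return np.insert(t, 0, 1, axis=1)   # add the intercept column
    return [prepare(x) for x in (X_train, *others)]

# X_poly2, Xval_poly2, Xtest_poly2 = prepare_poly_data_train_stats(X, Xval, Xtest, power=8)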

3.3 Polynomial Regression

# plot the learning curve for a given lambda
def plot_learning_curve(X, y, Xval, yval, lamb):
    train_cost, cv_cost = [], []
    m = X.shape[0]
    for i in range(1, m+1):
        res = opt.minimize(fun=reg_cost, x0=np.ones(X.shape[1]), args=(X[:i], y[:i], lamb), method='TNC', jac=reg_gradient)
        train_cost.append(reg_cost(res.x, X[:i], y[:i], lamb))
        cv_cost.append(reg_cost(res.x, Xval, yval, lamb))
    plt.plot(np.arange(1, m+1), cv_cost, label='cv cost')
    plt.plot(np.arange(1, m+1), train_cost, label='train cost')
    plt.legend()
    plt.show()
plot_learning_curve(X_poly, y, Xval_poly, yval, 0)

We can see that the training error stays essentially zero throughout, while the cross-validation error only gradually decreases; this is a sign of overfitting.

3.4 Adjusting the Regularization Parameter

When \(\lambda=1\):

plot_learning_curve(X_poly, y, Xval_poly, yval, 1)

The training error is still small, but it no longer stays at zero, so the overfitting has been reduced.

When \(\lambda=100\):

plot_learning_curve(X_poly, y, Xval_poly, yval, 100)

The regularization is too strong here, and the model underfits.

Finding the best \(\lambda\):
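In other words, train one model per candidate \(\lambda\) and keep the one with the lowest cost on the cross-validation set:

\[
\lambda^{*} = \arg\min_{\lambda} J_{cv}\left(\theta^{(\lambda)}\right)
\]

(Note that the loop below evaluates the cross-validation cost with the same \(\lambda\) used for training, while the original exercise evaluates it with the regularization term dropped; the selected \(\lambda\) may shift slightly between the two conventions.)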

lamb_candidate = [(2**t) * 0.01 for t in range(11)]   # 0.01, 0.02, ..., 10.24
lamb_candidate.insert(0, 0)                           # also try lambda = 0
train_cost, cv_cost = [], []
for lamb in lamb_candidate:
    res = opt.minimize(fun=reg_cost, x0=np.ones(X_poly.shape[1]), args=(X_poly, y, lamb), method='TNC', jac=reg_gradient)
    train_cost.append(reg_cost(res.x, X_poly, y, lamb))
    cv_cost.append(reg_cost(res.x, Xval_poly, yval, lamb))
plt.plot(lamb_candidate, train_cost, label='train')
plt.plot(lamb_candidate, cv_cost, label='cv')
plt.legend()
plt.xlabel('lambda')
plt.ylabel('cost')
plt.show()

# lambda with the lowest cross-validation cost
lamb_candidate[np.argmin(cv_cost)]

0.32

# evaluate each candidate lambda on the test set
for lamb in lamb_candidate:
    res = opt.minimize(fun=reg_cost, x0=np.ones(X_poly.shape[1]), args=(X_poly, y, lamb), method='TNC', jac=reg_gradient)
    print(f'test cost(lamb={lamb})={reg_cost(res.x, Xtest_poly, ytest, lamb)}')

test cost(lamb=0)=9.982275423899827
test cost(lamb=0.01)=10.973543277025396
test cost(lamb=0.02)=10.548120143946353
test cost(lamb=0.04)=10.007174745468829
test cost(lamb=0.08)=9.392941014843759
test cost(lamb=0.16)=8.820134306044448
test cost(lamb=0.32)=8.587713795685527
test cost(lamb=0.64)=9.228251765308313
test cost(lamb=1.28)=11.470274383762167
test cost(lamb=2.56)=16.3078760577592
test cost(lamb=5.12)=24.97737418682823
test cost(lamb=10.24)=38.10335942580261

This confirms that the best \(\lambda\) is 0.32.