0%

Part1-异常检测和推荐系统练习

Part1-异常检测和推荐系统练习

异常检测

1.1 数据可视化

1
2
3
4
5
6
7
import matplotlib.pyplot as plt
import numpy as np
from scipy.io import loadmat
data = loadmat('ex8data1.mat')
X = data['X']
plt.scatter(X[:,0],X[:,1])
plt.show()

1.2 高斯分布参数估计

1
2
3
4
m = len(X)
u = (np.sum(X, axis=0))/m
sigma = (np.sum((X-u)**2, axis=0))/m
u, sigma

(array([14.11222578, 14.99771051]), array([1.83263141, 1.70974533]))

1
2
3
4
5
6
7
x = np.linspace(0, 25, 100)
y = np.linspace(0, 25, 100)
xx, yy = np.meshgrid(x, y)
Z = np.exp(-((xx-u[0])**2/sigma[0]+(yy-u[1])**2/sigma[1])/2)
plt.contour(xx, yy, Z, [10**-11,10**-7,10**-5,10**-3,0.1])
plt.scatter(X[:,0],X[:,1])
plt.show()

1.3 选择阈值

1
2
3
def p(X,u,sigma):
t = (np.exp(-(X-u)**2/(2*sigma)))/np.sqrt(2*np.pi*sigma)
return np.prod(t, axis=1)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
def select_epsilon(pred, yval):
best_epsilon = 0
best_f1 = 0
step = (pred.max()-pred.min())/1000
for epsilon in np.arange(pred.min(), pred.max(), step):
pred = pred < epsilon
tp = np.sum(np.logical_and(pred==1, yval==1))
fp = np.sum(np.logical_and(pred==1, yval==0))
fn = np.sum(np.logical_and(pred==0, yval==1))
precision = tp/(tp+fp)
recall = tp/(tp+fn)
f1 = (2*precision*recall)/(precision+recall)
if f1 > best_f1:
best_f1 = f1
best_epsilon = epsilon

return best_epsilon, best_f1
1
2
3
pred = p(data['Xval'], u, sigma)
epsilon, f1 = select_epsilon(pred, data['yval'])
epsilon, f1

(8.990852779269493e-05, 0.056962025316455694)

1
2
3
4
res = np.where(pred < epsilon)
plt.scatter(X[:,0], X[:,1])
plt.scatter(X[res[0], 0], X[res[0], 1], s=100, color='r', marker='x')
plt.show()

1.4 高维数据

1
2
3
4
5
from scipy import stats
p = stats.multivariate_normal(data['X'].mean(axis=0), np.cov(data['X'].T)).pdf(data['X'])
pred = stats.multivariate_normal(data['X'].mean(axis=0), np.cov(data['X'].T)).pdf(data['Xval'])
epsilon, f1 = select_epsilon(pred, data['yval'])
epsilon, f1

(1.7464996396712342e-18, 0.18181818181818182)

1
2
res = np.where(p < epsilon)
len(res[0])

122

检测结果显示有122个异常点

推荐系统

2.1 加载数据

1
2
3
4
5
6
7
8
data = loadmat('ex8_movies.mat')
data_param = loadmat('ex8_movieParams.mat')
# Y是包含从1到5的等级评分,R是表示用户是否对电影进行评分
R = data['R']
Y = data['Y']
X = data_param['X']
theta = data_param['Theta']
Y.shape,R.shape

((1682, 943), (1682, 943))

2.2 协同过滤代价函数

1
2
3
4
5
6
7
8
def serialize(X, theta):
return np.concatenate((X.ravel(), theta.ravel()))
def deserialize(param, n_m, n_u, n):
return param[:n_m*n].reshape(n_m, n), param[n_m*n:].reshape(n_u, n)
def cost(param, Y, R, n):
n_m, n_u = Y.shape
X, theta = deserialize(param, n_m, n_u, n)
return np.power((X@theta.T-Y)*R,2).sum()/2

2.3 协同过滤梯度

1
2
3
4
5
6
7
def gradient(param, Y, R, n):
n_m, n_u = Y.shape
X, theta = deserialize(param, n_m, n_u, n)
inner = (X@theta.T-Y)*R
X_grad = inner@theta
theta_grad = inner.T@X
return serialize(X_grad, theta_grad)

2.3 正则化

1
2
3
4
5
6
def reg_cost(param, Y, R, n, lamb=1):
reg_term = np.power(param, 2).sum()*(lamb/2)
return cost(param, Y, R, n)+reg_term
def reg_gradient(param, Y, R, n, lamb=1):
reg_term = 1*param
return gradient(param, Y, R, n)+reg_term

2.4 预测电影评分

2.4.1 读取数据

1
2
3
4
5
movie_list = []
with open('movie_id.txt') as f:
for line in f:
movie_list.append(''.join(line.strip().split(' ')[1:]))
movie_list

2.4.2 初始化自己对电影的评分

1
2
3
4
5
6
7
8
9
10
11
12
13
14

ratings = np.zeros(1682)

ratings[0] = 4
ratings[6] = 3
ratings[11] = 5
ratings[53] = 4
ratings[63] = 5
ratings[65] = 3
ratings[68] = 5
ratings[97] = 2
ratings[182] = 4
ratings[225] = 5
ratings[354] = 5

2.4.3 数据预处理

1
2
3
4
5
6
7
Y = np.insert(Y, 0, ratings, axis=1)
R = np.insert(R, 0, ratings!=0, axis=1)
n = 10
n_m, n_u = Y.shape
X = np.random.standard_normal((n_m, n))
theta = np.random.standard_normal((n_u, n))
X.shape, theta.shape

((1682, 10), (944, 10))

1
2
param = serialize(X, theta)
param.shape

(26260,)

1
2
Y_norm = Y - Y.mean(axis=1, keepdims=True)
Y_norm.mean()

4.600291296941337e-18

1
2
import scipy.optimize as opt
res = opt.minimize(fun = reg_cost, x0=param, args=(Y_norm, R, n, 1), method='TNC', jac=reg_gradient)
1
2
X_trained,theta_trained=deserialize(res.x,n_m,n_u,n)
X_trained.shape,theta_trained.shape

((1682, 10), (944, 10))

1
2
3
4
pred = X_trained@theta_trained.T
p = pred[:, 0]+Y.mean()
idx = np.argsort(p)[::-1]
idx.shape

(1682,)

1
2
for m in movie_list[idx][:10]:
print(m)

StarTrek:Generations(1994)

DieHard:WithaVengeance(1995)

LiveNudeGirls(1995)

PrivateParts(1997)

Sphere(1998)

HauntedWorldofEdwardD.WoodJr.,The(1995)

ThreeCaballeros,The(1945)

Michael(1996) Sneakers(1992)

WeddingSinger,The(1998)