import matplotlib.pyplot as plt
import numpy as np
from scipy.io import loadmat

# Read the anomaly-detection data file and keep the array stored under 'X'.
data = loadmat('ex8data1.mat')
X = data['X']

# Visual sanity check: scatter column 0 of X against column 1.
plt.scatter(X[:, 0], X[:, 1])
plt.show()
1.2 高斯分布参数估计
1 2 3 4
# Per-column Gaussian parameter estimates for X.
m = X.shape[0]                              # number of rows (samples) in X
u = X.sum(axis=0) / m                       # column-wise mean
# Column-wise mean squared deviation from u. Note the divisor is m (not m - 1)
# and the value stored in `sigma` is the variance, not its square root.
sigma = np.square(X - u).sum(axis=0) / m
u, sigma  # notebook-style echo of the two estimates
from scipy import stats

# Fit a single multivariate normal to the training features (sample mean and
# full covariance matrix) and reuse it for both density evaluations.
train_features = data['X']
density_model = stats.multivariate_normal(
    train_features.mean(axis=0),
    np.cov(train_features.T),
)

p = density_model.pdf(train_features)    # density of every training sample
pred = density_model.pdf(data['Xval'])   # density of every validation sample

# select_epsilon is defined elsewhere in the file; the call site unpacks its
# result into two values (presumably the chosen threshold and its F1 score --
# confirm against its definition).
epsilon, f1 = select_epsilon(pred, data['yval'])
epsilon, f1  # notebook-style echo
(1.7464996396712342e-18, 0.18181818181818182)
1 2
# Indices of the training samples whose density is below the threshold.
# `res` is a tuple of index arrays, one per dimension of `p`.
res = np.nonzero(p < epsilon)
len(res[0])  # number of flagged samples (notebook-style echo)
122
检测结果显示有122个异常点
2 推荐系统
2.1 加载数据
1 2 3 4 5 6 7 8
data = loadmat('ex8_movies.mat')
data_param = loadmat('ex8_movieParams.mat')

# Y holds the ratings on a 1-to-5 scale; R indicates whether a user has
# rated a given movie.
R, Y = data['R'], data['Y']

# Parameter arrays shipped with the exercise.
X, theta = data_param['X'], data_param['Theta']

Y.shape, R.shape  # notebook-style echo of the two shapes
def reg_cost(param, Y, R, n, lamb=1):
    """Return the unregularized cost plus an L2 penalty on ``param``.

    Args:
        param: Parameter array being optimized; it is squared element-wise
            and summed for the penalty (presumably the flattened model
            parameters -- confirm against ``cost``).
        Y: Passed through unchanged to ``cost``.
        R: Passed through unchanged to ``cost``.
        n: Passed through unchanged to ``cost``.
        lamb: Regularization strength; the penalty is
            ``(lamb / 2) * sum(param ** 2)``.

    Returns:
        ``cost(param, Y, R, n)`` plus the penalty term.
    """
    penalty = (lamb / 2) * np.power(param, 2).sum()
    return cost(param, Y, R, n) + penalty


def reg_gradient(param, Y, R, n, lamb=1):
    """Return the gradient matching ``reg_cost``.

    The penalty ``(lamb / 2) * sum(param ** 2)`` differentiates to
    ``lamb * param``, so the regularization strength must scale the added
    term; otherwise the gradient disagrees with the cost whenever
    ``lamb != 1``.

    Args:
        param: Parameter array being optimized.
        Y: Passed through unchanged to ``gradient``.
        R: Passed through unchanged to ``gradient``.
        n: Passed through unchanged to ``gradient``.
        lamb: Regularization strength (same meaning as in ``reg_cost``).

    Returns:
        ``gradient(param, Y, R, n)`` plus ``lamb * param``.
    """
    return gradient(param, Y, R, n) + lamb * param
2.4 预测电影评分
2.4.1 读取数据
1 2 3 4 5
# Read one movie title per line. Each line is stripped, its first
# space-separated token is dropped, and the remainder of the line is kept
# intact so multi-word titles retain the spaces between their words.
# NOTE(review): no encoding is passed to open(), so the platform default is
# used -- confirm it matches the file's actual encoding.
with open('movie_id.txt') as f:
    movie_list = [line.strip().partition(' ')[2] for line in f]
movie_list  # notebook-style echo