Introduction to Computational Mathematics / Group 31 / Yan-Yu Chen (陳彥宇) / Econ B05303134
The main goal of this project is to demonstrate how to recognize images of handwritten digits, focusing on the digits $3$, $4$, and $5$, by matrix factorization, in particular the singular value decomposition (SVD).
In this experiment, we divided the images from the MNIST database into two groups, the training data and the testing data. We then used the SVD to build a classifier from the training data and evaluated the classifier by its accuracy on the testing data.
import numpy as np
from numpy import linalg as LA
import matplotlib.pyplot as plt
import seaborn as sns
The images are collected from the famous MNIST (Modified National Institute of Standards and Technology) database, a database of handwritten digits that is commonly used for training various image processing systems. The data contains $60,000$ training images and $10,000$ testing images. Each image has been normalized to $28 \times 28$ pixels.
mnist = np.load('mnist.npz')
x_train = mnist['x_train']
y_train = mnist['y_train']
x_train = x_train[y_train >= 3]
y_train = y_train[y_train >= 3]
x_train = x_train[y_train <= 5].reshape((14466,28,28))
y_train = y_train[y_train <= 5]
x_test = mnist['x_test']
y_test = mnist['y_test']
x_test = x_test[y_test >= 3]
y_test = y_test[y_test >= 3]
x_test = x_test[y_test <= 5].reshape((2884,28,28))
y_test = y_test[y_test <= 5]
print('x_train shape:', x_train.shape)
print('y_train shape:', y_train.shape)
print('x_test shape:', x_test.shape)
print('y_test shape:', y_test.shape)
x_train3 = x_train[y_train == 3]
x_train4 = x_train[y_train == 4]
x_train5 = x_train[y_train == 5]
fig = plt.figure(figsize=(9,6), dpi=80)
plt.subplot(1,3,1, xticks=[], yticks=[])
plt.imshow(x_train3[0], cmap='Greys')
plt.xlabel(str(x_train3.shape[0])+
' images ('+str(round(x_train3.shape[0]/x_train.shape[0]*100))+'%)')
plt.subplot(1,3,2, xticks=[], yticks=[])
plt.imshow(x_train4[0], cmap='Greys')
plt.xlabel(str(x_train4.shape[0])+
' images ('+str(round(x_train4.shape[0]/x_train.shape[0]*100))+'%)')
plt.subplot(1,3,3, xticks=[], yticks=[])
plt.imshow(x_train5[0], cmap='Greys')
plt.xlabel(str(x_train5.shape[0])+
' images ('+str(round(x_train5.shape[0]/x_train.shape[0]*100))+'%)')
plt.show()
fig = plt.figure(figsize=(6, 6))
fig.subplots_adjust(left=0, right=1,top=1, bottom=0, hspace=0.05, wspace=0.05)
for i in range(64):
    ax = fig.add_subplot(8, 8, i + 1, xticks=[], yticks=[])
    ax.imshow(x_train[i], cmap='Greys', interpolation='nearest')
    ax.text(0, 7, str(y_train[i]), color='g')
plt.show()
sample3 = x_train3/255
sample4 = x_train4/255
sample5 = x_train5/255
all_mean3 = sample3.mean(axis=(1,2))
all_mean4 = sample4.mean(axis=(1,2))
all_mean5 = sample5.mean(axis=(1,2))
fig = plt.figure(figsize=(12,5), dpi=80)
plt.hist(all_mean3, bins=50, color='y', label='3')
plt.hist(all_mean4, bins=50, color='g', alpha=0.5, label='4')
plt.hist(all_mean5, bins=50, color='c', alpha=0.5, label='5')
plt.axvline(all_mean3.mean(), color='b', linestyle='dashed', linewidth=1, label='mean of 3')
plt.axvline(all_mean4.mean(), color='r', linestyle='dashed', linewidth=1, label='mean of 4')
plt.axvline(all_mean5.mean(), color='m', linestyle='dashed', linewidth=1, label='mean of 5')
plt.legend()
plt.title('Average')
plt.show()
We use multidimensional scaling (MDS) to give a sketch of our training data.
def plot_embedding(X, title=None):
    # rescale the 2-D embedding to [0, 1] and draw each point as its digit label
    x_min, x_max = np.min(X, 0), np.max(X, 0)
    X = (X - x_min) / (x_max - x_min)
    plt.figure(figsize=(12,5), dpi=80)
    ax = plt.subplot(111)
    for i in range(X.shape[0]):
        plt.text(X[i, 0], X[i, 1], str(y_train[i]),
                 color=plt.cm.Set1(y_train[i] / 10.),
                 fontdict={'weight': 'bold', 'size': 9})
    plt.xticks([]), plt.yticks([])
    if title is not None:
        plt.title(title)
from sklearn import manifold
clf = manifold.MDS(n_components=2, n_init=1, max_iter=100)
X_mds = clf.fit_transform(x_train.reshape(14466, 784)[:700,],y_train[:700,])
plot_embedding(X_mds,"MDS Embedding of the Digits")
plt.show()
Following the singular value decomposition (SVD) method described in Chapter 10 of the book Matrix Methods in Data Mining and Pattern Recognition by Lars Elden, the algorithm in this experiment recognizes the images of handwritten digits in two steps: (1) for each digit, stack its training images as raster vectors into a matrix, compute its SVD, and keep the first few right singular vectors (the "singular images") as an orthonormal basis; (2) classify each test image as the digit whose basis leaves the smallest residual.
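As a compact sketch of these two steps (the helper names fit_basis and classify are illustrative only and are not used below; the cells that follow implement the same idea step by step):
def fit_basis(images, k):
    # images: (n_i, 784) matrix whose rows are the raster vectors of one digit;
    # keep the first k right singular vectors ("singular images") as an orthonormal basis
    _, _, Vt = LA.svd(images)
    return Vt[:k, :]

def classify(z, bases):
    # bases: dict mapping digit -> (k, 784) basis; pick the digit with the smallest relative residual
    res = {d: LA.norm(z - Vk.T @ (Vk @ z)) / LA.norm(z) for d, Vk in bases.items()}
    return min(res, key=res.get)

# usage sketch: bases = {d: fit_basis(x_train[y_train == d].reshape(-1, 784), 3) for d in (3, 4, 5)}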
x_train3 = x_train3.reshape(5101,784)
x_train4 = x_train4.reshape(4859,784)
x_train5 = x_train5.reshape(4506,784)
x = np.linspace(1, 784, 784)  # one coordinate per pixel of the raster vector
fig = plt.figure(figsize=(9,5), dpi=80)
plt.subplot(2,1,1, xticks=[], yticks=[])
plt.imshow(x_train3[0].reshape(28,28), cmap="Greys")
plt.subplot(2,1,2, yticks=[])
extent = [x[0]-(x[1]-x[0])/2., x[-1]+(x[1]-x[0])/2.,0,1]
plt.imshow(x_train3[0][np.newaxis,:], cmap="Greys", aspect="auto", extent=extent)
plt.title('One Raster Vector of Digit 3')
plt.show()
x_train_ = [x_train3, x_train4, x_train5]
fig = plt.figure(figsize=(9, 5))
fig.subplots_adjust(left=0, right=1,top=1, bottom=0, hspace=0.2, wspace=0.05)
for i in range(3):
    raster = x_train_[i]
    ax = fig.add_subplot(3, 1, i + 1, xticks=[])
    ax.imshow(raster.transpose(), cmap='Greys', interpolation='nearest')
    plt.title('Raster Vectors of Digit '+str(i+3))
plt.show()
For each digit $i$, we have a corresponding matrix $A$ in $\mathbb{R}^{n_i\times784}$, where $n_i$ is the number of pictures of that digit. Decompose $A$ as $$A = USV^t, $$ where $U$ and $V$ are orthogonal matrices, and $S$ is a (rectangular) diagonal matrix with the singular values on its diagonal in decreasing order. The right singular vectors (the rows of $V^t$), reshaped to $28\times28$, are the singular images of digit $i$. Since the first few singular images are more significant than the others, Lars Elden selects the first $3$ singular images as a basis in the textbook; our experiment below suggests selecting the first $42$ singular images for the MNIST database.
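Equivalently, truncating the expansion after the first $k$ terms gives the best rank-$k$ approximation of $A$ in the $2$-norm (Eckart–Young): $$A \approx A_k = \sum_{j=1}^{k}\sigma_j u_j v_j^t, \qquad \lVert A - A_k\rVert_2 = \sigma_{k+1},$$ where $\sigma_j$ are the singular values and $u_j$, $v_j$ the corresponding left and right singular vectors.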
U, S, V = LA.svd(x_train3)
print('The first 10 singular values of digit 3 are \n',S[:10],'.')
fig = plt.figure(figsize=(9,4), dpi=80)
plt.plot(S)
plt.plot(3, S[2], 'om')
plt.plot(41, S[40], 'or')
plt.axvline(3, color='m', linestyle='dashed', linewidth=1, label='3rd')
plt.axvline(41, color='r', linestyle='dashed', linewidth=1, label='42nd')
plt.title('Singular Values of Digit 3')
plt.legend()
plt.show()
S_ = np.zeros((x_train3.shape[0], x_train3.shape[1]))
fig = plt.figure(figsize=(6, 6))
fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)
for i in range(64):
    A = V[i,]
    ax = fig.add_subplot(8, 8, i + 1, xticks=[], yticks=[])
    ax.imshow(A.reshape(28,28), cmap='Greys', interpolation='nearest')
    if i == 0:
        tmp = str(i+1)+'st'
    elif i == 1:
        tmp = str(i+1)+'nd'
    elif i == 2:
        tmp = str(i+1)+'rd'
    else:
        tmp = str(i+1)+'th'
    if i == 2:
        ax.text(0, 6, tmp, color = 'm')
    elif i == 41:
        ax.text(0, 6, tmp, color = 'r')
    else:
        ax.text(0, 6, tmp, color = 'g')
u = []
s = []
v = []
all_s = []
all_v = []
x_sing = []
for i in range(3):
    U, S, V = LA.svd(x_train_[i])
    S_ = np.zeros((x_train_[i].shape[0], x_train_[i].shape[1]))
    S_[:x_train_[i].shape[1], :x_train_[i].shape[1]] = np.diag(S)
    all_s.append(S_)
    all_v.append(V)
    n_component = 3  # take the first three singular images
    S_ = S_[:, :n_component]
    VT = V[:n_component, :]
    u.append(U)
    s.append(S_)
    v.append(VT)
    A = U.dot(S_.dot(VT))
    x_sing.append(A)
fig = plt.figure(figsize=(6, 6))
fig.subplots_adjust(left=0, right=1,top=1, bottom=0, hspace=0.05, wspace=0.05)
for i in range(9):
    sing_img = np.array(v).reshape(9,784)[i]
    ax = fig.add_subplot(3, 3, i + 1, xticks=[], yticks=[])
    ax.imshow(sing_img.reshape(28,28), cmap='Greys', interpolation='nearest')
    j = i
    while j > 2:
        j -= 3
    if j == 0:
        tmp = str(j+1)+'st'
    elif j == 1:
        tmp = str(j+1)+'nd'
    elif j == 2:
        tmp = str(j+1)+'rd'
    ax.text(0, 3, tmp, color = 'g')
plt.show()
The residual of a raster vector $v$ with respect to digit $i$ is calculated by $$\text{residual}_i(v)=\frac{\lVert (I-V_3V_3^t)\,v\rVert_2}{\lVert v\rVert_2}, $$ where $V_3 = (v_1,v_2,v_3)$ collects the first three singular images of digit $i$ as columns, and $3$ is the number of singular images we choose (this parameter is tuned later). The division by $\lVert v\rVert_2$ makes the residual independent of the overall intensity of the image. We classify $v$ as the digit with the minimal residual, that is, $$\hat{i}=\arg\min_i\,\text{residual}_i(v) .$$
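Since the columns of $V_3$ are orthonormal (they are right singular vectors), $V_3V_3^t$ is an orthogonal projector, so the numerator is exactly the distance from $v$ to the subspace spanned by the first three singular images: $$\lVert (I-V_3V_3^t)\,v\rVert_2=\min_{c\in\mathbb{R}^3}\lVert v-V_3c\rVert_2 .$$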
def residual(U, z):
    # relative residual of z with respect to the subspace spanned by the rows of U
    return LA.norm(z - U.transpose().dot(U.dot(z))) / LA.norm(z)
x_test = x_test.reshape(2884,784)
tmp = []
v_tmp = np.zeros(784 * 784).reshape(784,784)
for i in range(3):
    v_tmp[:3,:] = v[i]
    tmp.append(residual(v_tmp, x_test[0]))
plt.figure(figsize=(9,5),dpi=80)
plt.subplot(2,1,1, xticks = [], yticks = [])
plt.imshow(x_test[0].reshape(28,28),cmap='Greys')
plt.subplot(2,1,2)
plt.plot(np.linspace(3, 5, 3), tmp)
plt.plot(4, tmp[1], 'ob')
plt.title('Residuals from Singular Images')
plt.ylabel('Residuals')
plt.xlabel('Digit')
plt.xticks(np.linspace(3, 5, 3))
plt.show()
y_pred_ = []
v_tmp = np.zeros(784 * 784).reshape(784,784)
for j in range(len(x_test)):
    dist_ = []
    for i in range(3):
        v_tmp[:3,:] = v[i]
        dist_.append(residual(v_tmp, x_test[j]))
    y_pred_.append(np.argmin(dist_)+3)
accuracy_ = sum(y_pred_ == y_test)/len(y_test)*100
print('The accuracy is', accuracy_, '%.')
The red number is our prediction, and the green number is the true label. It seems that some badly drawn images are too difficult for the SVD-based algorithm to recognize. (In fact, even a human may misclassify them!)
xmis_pred_ = x_test[y_pred_ != y_test]
ymis_pred_ = np.array(y_pred_)[y_pred_ != y_test]
tmp = y_test[y_pred_ != y_test]
fig = plt.figure(figsize=(6, 6))
fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)
for i in range(64):
    ax = fig.add_subplot(8, 8, i + 1, xticks=[], yticks=[])
    ax.imshow(xmis_pred_[i].reshape(28,28), cmap='Greys', interpolation='nearest')
    ax.text(0, 7, str(ymis_pred_[i]), color='r')
    ax.text(0, 14, str(tmp[i]), color='g')
We define a confusion function that tabulates the true labels against the predictions as a matrix.
def confusion(y_test, y_pred):
    # confusion_matrix[r][c] counts images whose true label is r+3 and predicted label is c+3
    confusion_matrix = np.zeros(9).reshape(3,3)
    for i in range(len(y_test)):
        confusion_matrix[y_test[i]-3][y_pred[i]-3] += 1
    # normalize each row by the number of test images carrying that true label
    confusion_prob = []
    for i in range(3):
        confusion_prob.append(1/sum(y_test == i+3) * confusion_matrix[i])
    return confusion_matrix, confusion_prob
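As an optional cross-check (not part of the original pipeline), scikit-learn, which we already use above for MDS, provides an equivalent count matrix:
from sklearn.metrics import confusion_matrix as sk_confusion_matrix
# should reproduce the count matrix confusion(y_test, y_pred_)[0]
sk_confusion_matrix(y_test, y_pred_, labels=[3, 4, 5])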
plt.figure(figsize=(12, 5))
labels = [3, 4, 5]
plt.subplot(1,2,1)
y_confusion_ = confusion(y_test, y_pred_)
ax = sns.heatmap(y_confusion_[0], linewidth=1, annot=True, xticklabels=labels, yticklabels = labels,
cmap="Blues")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.title('Confusion Matrix')
plt.subplot(1,2,2)
ax = sns.heatmap(y_confusion_[1], linewidth=1, annot=True, xticklabels=labels, yticklabels = labels,
cmap="Blues")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.title('Confusion Probability')
plt.show()
For comparison, we also compute the residuals using all $784$ singular images of each digit as the basis.
y_pred_a = []
for j in range(len(x_test)):
    dist_ = []
    for i in range(3):
        dist_.append(residual(all_v[i], x_test[j]))
    y_pred_a.append(np.argmin(dist_)+3)
accuracy_a = sum(y_pred_a == y_test)/len(y_test)*100
print('The accuracy is', accuracy_a, '%.')
plt.figure(figsize=(12, 5))
labels = [3, 4, 5]
plt.subplot(1,2,1)
y_confusion_a = confusion(y_test, y_pred_a)
ax = sns.heatmap(y_confusion_a[0], linewidth=1, annot=True, xticklabels=labels, yticklabels = labels,
cmap="Blues")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.title('Confusion Matrix')
plt.subplot(1,2,2)
ax = sns.heatmap(y_confusion_a[1], linewidth=1, annot=True, xticklabels=labels, yticklabels = labels,
cmap="Blues")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.title('Confusion Probability')
plt.show()
To find the best number of orthogonal basis vectors for classifying the training data into the correct labels, we calculate the residuals for each number of basis vectors on the training data and compare their accuracies. The best number of singular images is the one with the highest accuracy on the training data. Namely, $$ \hat{k}_{\text{best}} := \arg\max_{k} \{\text{accuracy of $k$ singular images on the training data}\}.$$
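The search below evaluates the residual one image at a time for each of the $400$ values of $k$, which is slow. A vectorized helper (a sketch under the same data layout; not the version used to produce the numbers below) computes the residuals of all images against one digit's basis in a single matrix product:
def residuals_all(X, Vk):
    # X: (m, 784) raster vectors; Vk: (k, 784) singular images of one digit
    proj = (X @ Vk.T) @ Vk  # orthogonal projections onto the span of the rows of Vk
    return LA.norm(X - proj, axis=1) / LA.norm(X, axis=1)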
x_train = x_train.reshape(14466,784)
accuracy_train_b = []
accuracy_test_b = []
for j in range(400):
    n_component = j+1
    v_train_b = []
    for i in range(3):
        VT = all_v[i][:n_component, :]
        v_train_b.append(VT)
    y_pred_train_b_tmp = []
    y_pred_test_b_tmp = []
    for k in range(len(x_train)):
        dist_train = []
        for s in range(3):
            dist_train.append(residual(v_train_b[s], x_train[k]))
        y_pred_train_b_tmp.append(np.argmin(dist_train)+3)
    for k in range(len(x_test)):
        dist_test = []
        for s in range(3):
            dist_test.append(residual(v_train_b[s], x_test[k]))
        y_pred_test_b_tmp.append(np.argmin(dist_test)+3)
    accuracy_train_tmp = sum(y_pred_train_b_tmp == y_train)/len(y_train)*100
    accuracy_train_b.append(accuracy_train_tmp)
    accuracy_test_tmp = sum(y_pred_test_b_tmp == y_test)/len(y_test)*100
    accuracy_test_b.append(accuracy_test_tmp)
idx = np.argmax(accuracy_train_b)
fig = plt.figure(figsize=(9,6), dpi=80)
plt.plot(accuracy_train_b, label = 'train')
plt.plot(accuracy_test_b, label = 'test')
plt.axvline(2, linestyle='dashed', linewidth=1, color = 'm', label = '3rd')
plt.axvline(idx, linestyle='dashed', linewidth=1, color = 'r', label = str(idx+1)+'th')
plt.ylabel('%')
plt.xlabel('# singular images')
plt.title('Accuracy')
plt.legend()
plt.show()
print('The best number of singular images is', idx+1, ',')
print('and the corresponding accuracy on the testing data is', accuracy_test_b[idx], '%.')
print('The best potential accuracy on the testing data is', np.max(accuracy_test_b), '%.')
The red number is our prediction, and the green number is the true label. It looks like most of the remaining misclassifications are drawings that are too thick. Moreover, this phenomenon only happened on digits $3$ and $5$.
y_pred_b = []
v_tmp = np.zeros(784 * 784).reshape(784,784)
for j in range(len(x_test)):
    dist_ = []
    for i in range(3):
        # use the best number of singular images found above (idx+1 of them)
        v_tmp[:idx+1,:] = all_v[i][:idx+1,:]
        dist_.append(residual(v_tmp, x_test[j]))
    y_pred_b.append(np.argmin(dist_)+3)
xmis_pred_b = x_test[y_pred_b != y_test]
ymis_pred_b = np.array(y_pred_b)[y_pred_b != y_test]
tmp = y_test[y_pred_b != y_test]
fig = plt.figure(figsize=(6, 6))
fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)
for i in range(39):
    ax = fig.add_subplot(8, 8, i + 1, xticks=[], yticks=[])
    ax.imshow(xmis_pred_b[i].reshape(28,28), cmap='Greys', interpolation='nearest')
    ax.text(0, 7, str(ymis_pred_b[i]), color='r')
    ax.text(0, 14, str(tmp[i]), color='g')
plt.figure(figsize=(12, 5))
labels = [3, 4, 5]
plt.subplot(1,2,1)
y_confusion_b = confusion(y_test, y_pred_b)
ax = sns.heatmap(y_confusion_b[0], linewidth=1, annot=True, xticklabels=labels, yticklabels = labels,
cmap="Blues")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.title('Confusion Matrix')
plt.subplot(1,2,2)
ax = sns.heatmap(y_confusion_b[1], linewidth=1, annot=True, xticklabels=labels, yticklabels = labels,
cmap="Blues")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.title('Confusion Probability')
plt.show()
To overcome the disadvantages of the SVD method, maybe we could...
We used hierarchical clustering to cluster the raster vectors of each digit over the columns. Based on the following visualization, it seems that there may be some sub-labels within digits $3$ and $5$, especially digit $5$; modelling these sub-labels might improve the performance of the SVD method. A sketch of this idea follows the cluster maps.
test_f1 = sns.clustermap(x_train3.transpose(), cmap="Greys", row_cluster=False)
test_f2 = sns.clustermap(x_train4.transpose(), cmap="Greys", row_cluster=False)
test_f3 = sns.clustermap(x_train5.transpose(), cmap="Greys", row_cluster=False)
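A hedged sketch of this idea, assuming scikit-learn's KMeans (not run in this report): split each digit's training images into a few clusters, fit one SVD basis per cluster, and classify by the minimum residual over all cluster bases.
from sklearn.cluster import KMeans

def sub_bases(images, k_basis=3, n_sub=2):
    # cluster one digit's raster vectors into n_sub groups, then fit an SVD basis per group
    labels = KMeans(n_clusters=n_sub, n_init=10, random_state=0).fit_predict(images)
    return [LA.svd(images[labels == c])[2][:k_basis, :] for c in range(n_sub)]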