Class Matrix
import random
class Matrix(list):
@classmethod
def zeros(cls, shape):
n_rows, n_cols = shape
return cls([[0] * n_cols for i in range(n_rows)])
@classmethod
def random(cls, shape):
M, (n_rows, n_cols) = cls(), shape
for i in range(n_rows):
M.append([random.randint(-255, 255)
for j in range(n_cols)])
return M
def transpose(self):
n_rows, n_cols = self.shape
return self.__class__(zip(*self))
@property
def shape(self):
return ((0, 0) if not self else (len(self), len(self[0])))
def matrix_product(X, Y):
"""Computes the matrix product of X and Y.
>>> X = Matrix([[1], [2], [3]])
>>> Y = Matrix([[4, 5, 6]])
>>> matrix_product(X, Y)
[[4, 5, 6], [8, 10, 12], [12, 15, 18]]
>>> matrix_product(Y, X)
[[32]]
"""
n_xrows, n_xcols = X.shape
n_yrows, n_ycols = Y.shape
# верим, что с размерностями всё хорошо
Z = Matrix.zeros((n_xrows, n_ycols))
for i in range(n_xrows):
for j in range(n_xcols):
for k in range(n_ycols):
Z[i][k] += X[i][j] * Y[j][k]
return Z
%doctest_mode
Exception reporting mode: Plain
Doctest mode is: ON
>>> X = Matrix([[1], [2], [3]])
>>> Y = Matrix([[4, 5, 6]])
>>> matrix_product(X, Y)
[[4, 5, 6], [8, 10, 12], [12, 15, 18]]
>>> matrix_product(Y, X)
[[32]]
[[32]]
%doctest_mode
Exception reporting mode: Context
Doctest mode is: OFF
Runtime measurement
Everything seems to work, but how fast? Use the magic timeit command to check.
%%timeit shape = 64, 64; X = Matrix.random(shape); Y = Matrix.random(shape)
matrix_product(X, Y)
52.5 ms ± 1.02 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
Total: Multiplying two 64x64 matrices takes near 0.1 seconds. Y U SO SLOW?
We define an auxiliary function bench, which generates random matrices of the specified size, and then n_iter times multiplies them in a loop.
def bench(shape=(64, 64), n_iter=16):
X = Matrix.random(shape)
Y = Matrix.random(shape)
for iter in range(n_iter):
matrix_product(X, Y)
Let's try to look at it more closely with the help of the line_profiler module.
!pip install line_profiler
Defaulting to user installation because normal site-packages is not writeable
Requirement already satisfied: line_profiler in /home/fedorov/.local/lib/python3.12/site-packages (5.0.0)
%load_ext line_profiler
%lprun -f matrix_product bench()
Timer unit: 1e-09 s
Total time: 5.31022 s
File: /tmp/ipykernel_7428/3720342099.py
Function: matrix_product at line 1
Line # Hits Time Per Hit % Time Line Contents
==============================================================
1 def matrix_product(X, Y):
2 """Computes the matrix product of X and Y.
3
4 >>> X = Matrix([[1], [2], [3]])
5 >>> Y = Matrix([[4, 5, 6]])
6 >>> matrix_product(X, Y)
7 [[4, 5, 6], [8, 10, 12], [12, 15, 18]]
8 >>> matrix_product(Y, X)
9 [[32]]
10 """
11 16 87526.0 5470.4 0.0 n_xrows, n_xcols = X.shape
12 16 31645.0 1977.8 0.0 n_yrows, n_ycols = Y.shape
13 # верим, что с размерностями всё хорошо
14 16 869157.0 54322.3 0.0 Z = Matrix.zeros((n_xrows, n_ycols))
15 1040 582911.0 560.5 0.0 for i in range(n_xrows):
16 66560 34411875.0 517.0 0.6 for j in range(n_xcols):
17 4259840 2224753558.0 522.3 41.9 for k in range(n_ycols):
18 4194304 3049412607.0 727.0 57.4 Z[i][k] += X[i][j] * Y[j][k]
19 16 69070.0 4316.9 0.0 return Z
Note that the operation list .__ getitem__ is not free. Swap the for loops so that the code does less index lookups.
def matrix_product(X, Y):
n_xrows, n_xcols = X.shape
n_yrows, n_ycols = Y.shape
Z = Matrix.zeros((n_xrows, n_ycols))
for i in range(n_xrows):
Xi = X[i]
for k in range(n_ycols):
acc = 0
for j in range(n_xcols):
acc += Xi[j] * Y[j][k]
Z[i][k] = acc
return Z
%lprun -f matrix_product bench()
Timer unit: 1e-09 s
Total time: 4.99064 s
File: /tmp/ipykernel_7428/481365942.py
Function: matrix_product at line 1
Line # Hits Time Per Hit % Time Line Contents
==============================================================
1 def matrix_product(X, Y):
2 16 94661.0 5916.3 0.0 n_xrows, n_xcols = X.shape
3 16 29810.0 1863.1 0.0 n_yrows, n_ycols = Y.shape
4 16 375080.0 23442.5 0.0 Z = Matrix.zeros((n_xrows, n_ycols))
5 1040 542518.0 521.7 0.0 for i in range(n_xrows):
6 1024 633692.0 618.8 0.0 Xi = X[i]
7 66560 35416069.0 532.1 0.7 for k in range(n_ycols):
8 65536 34007986.0 518.9 0.7 acc = 0
9 4259840 2168283486.0 509.0 43.4 for j in range(n_xcols):
10 4194304 2710807108.0 646.3 54.3 acc += Xi[j] * Y[j][k]
11 65536 40405377.0 616.5 0.8 Z[i][k] = acc
12 16 47729.0 2983.1 0.0 return Z
2 seconds faster, but still too slow:> 30% of the time goes exclusively to iteration! Fix it.
def matrix_product(X, Y):
n_xrows, n_xcols = X.shape
n_yrows, n_ycols = Y.shape
Z = Matrix.zeros((n_xrows, n_ycols))
for i in range(n_xrows):
Xi, Zi = X[i], Z[i]
for k in range(n_ycols):
Zi[k] = sum(Xi[j] * Y[j][k] for j in range(n_xcols))
return Z
%lprun -f matrix_product bench()
Timer unit: 1e-09 s
Total time: 2.80204 s
File: /tmp/ipykernel_7428/1886970874.py
Function: matrix_product at line 1
Line # Hits Time Per Hit % Time Line Contents
==============================================================
1 def matrix_product(X, Y):
2 16 58653.0 3665.8 0.0 n_xrows, n_xcols = X.shape
3 16 28991.0 1811.9 0.0 n_yrows, n_ycols = Y.shape
4 16 284438.0 17777.4 0.0 Z = Matrix.zeros((n_xrows, n_ycols))
5 1040 566052.0 544.3 0.0 for i in range(n_xrows):
6 1024 776516.0 758.3 0.0 Xi, Zi = X[i], Z[i]
7 66560 38053714.0 571.7 1.4 for k in range(n_ycols):
8 65536 2762244176.0 42148.5 98.6 Zi[k] = sum(Xi[j] * Y[j][k] for j in range(n_xcols))
9 16 24767.0 1547.9 0.0 return Z
The matrix_product functions are pretty prettier. But, it seems, this is not the limit. Let’s try again to remove unnecessary index calls from the innermost cycle.
def matrix_product(X, Y):
n_xrows, n_xcols = X.shape
n_yrows, n_ycols = Y.shape
Z = Matrix.zeros((n_xrows, n_ycols))
Yt = Y.transpose() # <--
for i, (Xi, Zi) in enumerate(zip(X, Z)):
for k, Ytk in enumerate(Yt):
Zi[k] = sum(Xi[j] * Ytk[j] for j in range(n_xcols))
return Z
%lprun -f matrix_product bench()
Timer unit: 1e-09 s
Total time: 2.57215 s
File: /tmp/ipykernel_7428/3637370621.py
Function: matrix_product at line 1
Line # Hits Time Per Hit % Time Line Contents
==============================================================
1 def matrix_product(X, Y):
2 16 60198.0 3762.4 0.0 n_xrows, n_xcols = X.shape
3 16 29067.0 1816.7 0.0 n_yrows, n_ycols = Y.shape
4 16 244289.0 15268.1 0.0 Z = Matrix.zeros((n_xrows, n_ycols))
5 16 1107858.0 69241.1 0.0 Yt = Y.transpose() # <--
6 1040 838559.0 806.3 0.0 for i, (Xi, Zi) in enumerate(zip(X, Z)):
7 66560 39077755.0 587.1 1.5 for k, Ytk in enumerate(Yt):
8 65536 2530764392.0 38616.4 98.4 Zi[k] = sum(Xi[j] * Ytk[j] for j in range(n_xcols))
9 16 25251.0 1578.2 0.0 return Z
Numba
Numba does not work with inline lists. Rewrite the matrix_product function using ndarray.
import numba
import numpy as np
@numba.jit
def jit_matrix_product(X, Y):
n_xrows, n_xcols = X.shape
n_yrows, n_ycols = Y.shape
Z = np.zeros((n_xrows, n_ycols), dtype=X.dtype)
for i in range(n_xrows):
for k in range(n_ycols):
for j in range(n_xcols):
Z[i, k] += X[i, j] * Y[j, k]
return Z
Let's see what happened.
shape = 64, 64
X = np.random.randint(-255, 255, shape)
Y = np.random.randint(-255, 255, shape)
%timeit -n100 jit_matrix_product(X, Y)
The slowest run took 43.72 times longer than the fastest. This could mean that an intermediate result is being cached.
1.52 ms ± 3.18 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)
Cython
!pip install Cython
Defaulting to user installation because normal site-packages is not writeable
Requirement already satisfied: Cython in /home/fedorov/.local/lib/python3.12/site-packages (3.2.0)
%load_ext cython
%%capture
%%cython -a
import random
class Matrix(list):
@classmethod
def zeros(cls, shape):
n_rows, n_cols = shape
return cls([[0] * n_cols for i in range(n_rows)])
@classmethod
def random(cls, shape):
M, (n_rows, n_cols) = cls(), shape
for i in range(n_rows):
M.append([random.randint(-255, 255)
for j in range(n_cols)])
return M
def transpose(self):
n_rows, n_cols = self.shape
return self.__class__(zip(*self))
@property
def shape(self):
return ((0, 0) if not self else
(int(len(self)), int(len(self[0]))))
def cy_matrix_product(X, Y):
n_xrows, n_xcols = X.shape
n_yrows, n_ycols = Y.shape
Z = Matrix.zeros((n_xrows, n_ycols))
Yt = Y.transpose()
for i, Xi in enumerate(X):
for k, Ytk in enumerate(Yt):
Z[i][k] = sum(Xi[j] * Ytk[j] for j in range(n_xcols))
return Z
X = Matrix.random(shape)
Y = Matrix.random(shape)
%timeit -n100 cy_matrix_product(X, Y)
22.7 ms ± 533 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
The problem is that Cython cannot efficiently optimize work with lists that can contain elements of various types, so we rewrite matrix_product using ndarray.
X = np.random.randint(-255, 255, size=shape)
Y = np.random.randint(-255, 255, size=shape)
%%capture
%%cython -a
import numpy as np
def cy_matrix_product(X, Y):
n_xrows, n_xcols = X.shape
n_yrows, n_ycols = Y.shape
Z = np.zeros((n_xrows, n_ycols), dtype=X.dtype)
for i in range(n_xrows):
for k in range(n_ycols):
for j in range(n_xcols):
Z[i, k] += X[i, j] * Y[j, k]
return Z
%timeit -n100 cy_matrix_product(X, Y)
132 ms ± 2.8 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)
How so! It just got worse, with most of the code still using Python calls. Let's get rid of them by annotating the code with types.
%%capture
%%cython -a
import numpy as np
cimport numpy as np
def cy_matrix_product(np.ndarray X, np.ndarray Y):
cdef int n_xrows = X.shape[0]
cdef int n_xcols = X.shape[1]
cdef int n_yrows = Y.shape[0]
cdef int n_ycols = Y.shape[1]
cdef np.ndarray Z
Z = np.zeros((n_xrows, n_ycols), dtype=X.dtype)
for i in range(n_xrows):
for k in range(n_ycols):
for j in range(n_xcols):
Z[i, k] += X[i, j] * Y[j, k]
return Z
%timeit -n100 cy_matrix_product(X, Y)
131 ms ± 1.03 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)
Unfortunately, typical annotations did not change the run time, because the body of the nested Cython itself could not optimize. Fatality-time: indicate the type of elements in ndarray.
%%capture
%%cython -a
import numpy as np
cimport numpy as np
def cy_matrix_product(np.ndarray[np.int64_t, ndim=2] X,
np.ndarray[np.int64_t, ndim=2] Y):
cdef int n_xrows = X.shape[0]
cdef int n_xcols = X.shape[1]
cdef int n_yrows = Y.shape[0]
cdef int n_ycols = Y.shape[1]
cdef np.ndarray[np.int64_t, ndim=2] Z = \
np.zeros((n_xrows, n_ycols), dtype=np.int64)
for i in range(n_xrows):
for k in range(n_ycols):
for j in range(n_xcols):
Z[i, k] += X[i, j] * Y[j, k]
return Z
%timeit -n100 cy_matrix_product(X, Y)
556 μs ± 8.42 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
Let's try to go further and disable checks for going beyond the boundaries of the array and overflow of integer types.
%%capture
%%cython -a
import numpy as np
cimport cython
cimport numpy as np
@cython.boundscheck(False)
@cython.overflowcheck(False)
def cy_matrix_product(np.ndarray[np.int64_t, ndim=2] X,
np.ndarray[np.int64_t, ndim=2] Y):
cdef int n_xrows = X.shape[0]
cdef int n_xcols = X.shape[1]
cdef int n_yrows = Y.shape[0]
cdef int n_ycols = Y.shape[1]
cdef np.ndarray[np.int64_t, ndim=2] Z = \
np.zeros((n_xrows, n_ycols), dtype=np.int64)
for i in range(n_xrows):
for k in range(n_ycols):
for j in range(n_xcols):
Z[i, k] += X[i, j] * Y[j, k]
return Z
%timeit -n100 cy_matrix_product(X, Y)
232 μs ± 1.94 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
Numpy
import numpy as np
X = np.random.randint(-255, 255, shape)
Y = np.random.randint(-255, 255, shape)
%timeit -n100 X.dot(Y)
225 μs ± 3.3 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit -n100 X@Y
215 μs ± 3.8 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
Profit.