NumPy Crash Course

This Jupyter notebook goes over the most common NumPy functions for data analysis. It does not pretend to be a tutorial or a complete guide; just follow along and look at the inputs and outputs. Within a machine learning workflow, you will be able to use Python as a computational tool for analysing data. The structure of the instructions follows the book Python for Data Analysis, by Wes McKinney.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
In [2]:
data1 = [1,2.0,3.0,4.5]
arr1 = np.array(data1)
print(data1)
print(arr1)
[1, 2.0, 3.0, 4.5]
[1.  2.  3.  4.5]
In [3]:
arr2 = np.arange(10)
print(arr2)


# this instruction creates an array of ones with the same shape as the input
arr3 = np.ones_like(arr2)
print(arr3)
[0 1 2 3 4 5 6 7 8 9]
[1 1 1 1 1 1 1 1 1 1]
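
As a side note, the same shape-matching idea is available for other fill values; a small illustrative sketch:

# np.zeros_like and np.full_like follow the same pattern as np.ones_like
print(np.zeros_like(arr2))    # zeros with arr2's shape and dtype
print(np.full_like(arr2, 7))  # array filled with 7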
In [4]:
arr2_slice = arr3[3:7]
print(arr2_slice)

# now change the value of the first element of the slice
arr2_slice[0] = 3

print(arr3)

# the surprise comes from the fact that NumPy does not copy the data: a slice is simply a view
# of the array it was taken from. One way to avoid this is to explicitly copy the elements of the array

arr3_slice = arr3[3:7].copy()
arr3_slice[:] = 10

# this will not change the value of the original array
print(arr3_slice)
print(arr3)
[1 1 1 1]
[1 1 1 3 1 1 1 1 1 1]
[10 10 10 10]
[1 1 1 3 1 1 1 1 1 1]
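
If in doubt whether two arrays share the same underlying data, np.shares_memory gives a quick check; a small sketch:

# a plain slice is a view (shares memory); a .copy() is not
print(np.shares_memory(arr3, arr3[3:7]))         # True
print(np.shares_memory(arr3, arr3[3:7].copy()))  # False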
In [5]:
arr2d = np.array([[1,2,3],[2.0,3,1]])

# much more interesting is to create a 3d array (or an array of 2d arrays)
arr3d = np.array([arr2d,arr2d])
arr3d[0]

# broadcasting is an efficient technique in NumPy because the looping runs in the compiled C core!
arr3d[1,0]=12

print(arr3d)
[[[ 1.  2.  3.]
  [ 2.  3.  1.]]

 [[12. 12. 12.]
  [ 2.  3.  1.]]]
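
Broadcasting also works between arrays of different shapes; as an illustrative sketch, adding a length-3 vector to arr3d stretches it over the last axis:

row = np.array([10., 20., 30.])
# shape (2, 2, 3) + shape (3,): the vector is added to every row of arr3d
print(arr3d + row)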
In [6]:
# boolean indexing is a big thing in NumPy
names = np.array(['Patita','Pipa','Isa','teste','Patita','Pipa','Isa'])

# notice that the number of entries of names is equal to the number of rows in data below
data = np.random.randn(7,4)
print(data)
print()
print(data[names == 'Patita'])
print()
print(data[names != 'Patita'])
print()

mask = (names == 'Patita')|(names == 'Pipa')
print(data[mask])
[[ 1.07269875  1.24516675 -0.05024504 -0.65095377]
 [ 1.37615077 -0.64342956 -0.08639359  0.0960959 ]
 [ 0.43348222  2.18385295  0.76885986 -0.33497242]
 [-1.62197836 -2.05821145 -1.18079852  0.25987106]
 [-0.26683546 -0.30071784 -0.94769455 -0.17344232]
 [-0.75855823  0.67285195 -0.99733018  0.90678683]
 [-0.86378795 -0.83925107  0.66250357 -1.42641447]]

[[ 1.07269875  1.24516675 -0.05024504 -0.65095377]
 [-0.26683546 -0.30071784 -0.94769455 -0.17344232]]

[[ 1.37615077 -0.64342956 -0.08639359  0.0960959 ]
 [ 0.43348222  2.18385295  0.76885986 -0.33497242]
 [-1.62197836 -2.05821145 -1.18079852  0.25987106]
 [-0.75855823  0.67285195 -0.99733018  0.90678683]
 [-0.86378795 -0.83925107  0.66250357 -1.42641447]]

[[ 1.07269875  1.24516675 -0.05024504 -0.65095377]
 [ 1.37615077 -0.64342956 -0.08639359  0.0960959 ]
 [-0.26683546 -0.30071784 -0.94769455 -0.17344232]
 [-0.75855823  0.67285195 -0.99733018  0.90678683]]
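
A condition can also be negated with ~, and boolean indexing always returns a copy of the selected rows, never a view; a short sketch (data_copy is just an illustrative name):

# rows whose name is neither 'Patita' nor 'Pipa'
print(data[~mask])

# assigning through a boolean mask does modify the array in place
data_copy = data.copy()
data_copy[names == 'teste'] = 0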
In [7]:
# we can compose functions to produce, for example, an array with a shape other than one-dimensional
arr = np.arange(30).reshape(10,3)
print(arr)
print()
print(np.matmul(arr.T,arr))
[[ 0  1  2]
 [ 3  4  5]
 [ 6  7  8]
 [ 9 10 11]
 [12 13 14]
 [15 16 17]
 [18 19 20]
 [21 22 23]
 [24 25 26]
 [27 28 29]]

[[2565 2700 2835]
 [2700 2845 2990]
 [2835 2990 3145]]
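
The same product can be written with the @ operator, which calls np.matmul under the hood:

# equivalent to np.matmul(arr.T, arr)
print(arr.T @ arr)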
In [8]:
# np.meshgrid takes one-dimensional ranges and produces coordinate matrices containing every pair of points from these values

precision = np.arange(-5,5,.1)
xp , yp = np.meshgrid(precision,precision)
In [9]:
xp
Out[9]:
array([[-5. , -4.9, -4.8, ...,  4.7,  4.8,  4.9],
       [-5. , -4.9, -4.8, ...,  4.7,  4.8,  4.9],
       [-5. , -4.9, -4.8, ...,  4.7,  4.8,  4.9],
       ...,
       [-5. , -4.9, -4.8, ...,  4.7,  4.8,  4.9],
       [-5. , -4.9, -4.8, ...,  4.7,  4.8,  4.9],
       [-5. , -4.9, -4.8, ...,  4.7,  4.8,  4.9]])
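
When memory matters, np.meshgrid can return broadcastable arrays instead of two full grids; a small sketch with sparse=True:

xs, ys = np.meshgrid(precision, precision, sparse=True)
print(xs.shape, ys.shape)   # (1, 100) and (100, 1)
# xs**2 + ys**2 still broadcasts to the full (100, 100) grid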
In [10]:
# we can now compute the radius of the points in the grid
# this is an example of vectorization

z = np.sqrt(xp**2 + yp**2)

fig = plt.figure(figsize=(10, 10))
plt.imshow(z,cmap = plt.cm.gray)
plt.title(r"Image of $\sqrt{x^2+y^2}$ for the grid")

plt.show()
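
As a side note, the same radius can be computed with the dedicated ufunc np.hypot, which is equally vectorized:

z_alt = np.hypot(xp, yp)      # same as np.sqrt(xp**2 + yp**2)
print(np.allclose(z, z_alt))  # True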
In [11]:
# to better understand the np.where function (and its usefulness), the next example explores it
xarr = np.array([1.2,1.2,1.3,1.5])
yarr = np.array([3.4,5.6,3.4,9.3])
print(xarr)
print(yarr)
print()

# this function transforms a uniform(0,1) sample into a binary array
def escolhabin(in_arr):
    out = np.ones_like(in_arr)
    
    for i in range(len(in_arr)):
        if (in_arr[i] < 0.5):
            out[i] = 0

    return out 

sample = np.random.uniform(0,1,len(xarr))
print(sample)
print(escolhabin(sample))

# now we produce a boolean condition
cond = np.array(sample > .5)
print(cond)
print()
# and use this condition to create a new array in a pythonic way
# zip produces an iterator that stops at the shortest of the arrays being iterated over
result = np.array([(x if c else y) for x,y,c in zip(xarr,yarr,cond)])

print()
print(result)
[1.2 1.2 1.3 1.5]
[3.4 5.6 3.4 9.3]

[0.96162936 0.44318557 0.21900146 0.66620832]
[1. 0. 0. 1.]
[ True False False  True]


[1.2 5.6 3.4 1.5]
In [12]:
# the same result is achieved in a much simpler way with np.where
result = np.where(cond,xarr,yarr)
print(result)
[1.2 5.6 3.4 1.5]
In [13]:
# this makes escolhabin completely unnecessary, because the same result may be computed with
sample = np.where(np.random.uniform(0,1,len(xarr)) > 0.5, 1 , 0)
print(sample)

cond = np.array(sample > 0.5)
print(cond)

result = np.where(cond , xarr, yarr)
print(result)
[0 1 1 0]
[False  True  True False]
[3.4 1.2 1.3 9.3]
In [14]:
# we can also leave some of the original values untouched
tmparr = np.random.randn(10)
sample = np.where(tmparr > 0 , 1 , tmparr)
print(sample)
[ 1.         -2.31822889  1.          1.          1.         -0.4379
  1.          1.          1.          1.        ]
In [15]:
# we can take advantage of the fact that Python is an object-oriented language
print(np.mean(sample))

# produces exactly the same result as
print(sample.mean())
0.5243871110845418
0.5243871110845418
In [16]:
# in the more general case we aggregate along the axes of an array
sample = np.random.randn(30).reshape(10,3)
# the mean and the std over the columns
print(sample.mean(axis=0),sample.std(axis=0),end='\n\n')

# the mean and the std of the rows
print(sample.mean(axis=1),end='\n\n')
print(sample.std(axis=1))
[ 0.60472041  0.18930064 -0.26617673] [0.88611971 0.78131811 0.94195054]

[ 0.10414592  0.63143157 -0.02973066  0.66446524 -0.54020573  0.16669316
  0.26914265  0.42726363 -0.16590631  0.23218159]

[1.23648875 0.51800304 0.5319361  0.71850467 0.83800666 1.32430173
 0.67787437 1.0801551  0.2974647  0.95350762]
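
Other reductions follow the same axis convention; for instance, a quick sketch with sum and cumsum:

print(sample.sum(axis=0))     # one value per column
print(sample.cumsum(axis=1))  # running sum along each row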
In [17]:
# one quite useful function is argmin (or argmax)
# recall that, for a 2-D array, it returns an index into the flattened (row-major) array
print(sample.min())
print(sample.argmin(),end='\n\n')


print(sample)
-1.623870036242381
2

[[ 1.20016098  0.73614683 -1.62387004]
 [ 1.02729598  0.96731511 -0.10031638]
 [-0.59545271 -0.17629128  0.682552  ]
 [-0.0458466   0.39036268  1.64887966]
 [ 0.47971711 -0.52748101 -1.5728533 ]
 [ 2.03830536 -0.71026198 -0.8279639 ]
 [ 1.20487918 -0.01825861 -0.37919263]
 [-0.52774754  1.93727481 -0.12773639]
 [-0.29985864 -0.44428569  0.2464254 ]
 [ 1.56575098 -0.26151447 -0.60769175]]
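
Since argmin returns a position in the flattened array, np.unravel_index converts it back into a (row, column) pair; a short sketch:

flat = sample.argmin()
print(np.unravel_index(flat, sample.shape))  # here (0, 2), matching the output above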

Save and Load Data to and from Files

In [18]:
# we define an array and save it to the binary file lixo.npy ("lixo" : garbage)

sample = np.random.randn(100).reshape(20,5)
print(sample[0:5])
np.save('lixo',sample)
[[-0.60029385  0.37930252 -0.82189761 -0.36942611  0.95435642]
 [ 0.35983231 -0.44485853 -0.66568431  0.76497051  1.21789366]
 [-1.35636581  0.41113401  2.38877906  1.08167256 -0.13637988]
 [-0.12809221 -0.77460148 -0.76400274 -0.68346764  0.08953136]
 [-1.13554511  0.67209975 -0.14722805 -0.44111451 -0.6452894 ]]
In [19]:
# as you can imagine, the inverse operation is performed via the np.load function

sample = np.load('lixo.npy')
print(sample[0:5])
[[-0.60029385  0.37930252 -0.82189761 -0.36942611  0.95435642]
 [ 0.35983231 -0.44485853 -0.66568431  0.76497051  1.21789366]
 [-1.35636581  0.41113401  2.38877906  1.08167256 -0.13637988]
 [-0.12809221 -0.77460148 -0.76400274 -0.68346764  0.08953136]
 [-1.13554511  0.67209975 -0.14722805 -0.44111451 -0.6452894 ]]
In [20]:
# we can do more by giving keyword names to what is saved, and reuse them when loading

sample_uni = np.random.uniform(0,1,100).reshape(5,20)

np.savez('lixo.npz', gauss = sample , uniforme = sample_uni)
In [21]:
teste = np.load('lixo.npz')
print(teste['gauss'][0:5],end='\n\n')
print(teste['uniforme'][0:2])
[[-0.60029385  0.37930252 -0.82189761 -0.36942611  0.95435642]
 [ 0.35983231 -0.44485853 -0.66568431  0.76497051  1.21789366]
 [-1.35636581  0.41113401  2.38877906  1.08167256 -0.13637988]
 [-0.12809221 -0.77460148 -0.76400274 -0.68346764  0.08953136]
 [-1.13554511  0.67209975 -0.14722805 -0.44111451 -0.6452894 ]]

[[0.51880925 0.93049102 0.16619477 0.19870355 0.20386014 0.72513994
  0.47092723 0.52061101 0.79358421 0.59798903 0.01111465 0.55576177
  0.0079344  0.06363829 0.44517665 0.60079153 0.34963809 0.13242132
  0.55762246 0.75766884]
 [0.75369493 0.60986365 0.99259774 0.17358255 0.19059754 0.40473343
  0.33763723 0.64297082 0.56049389 0.52914627 0.62966138 0.60151565
  0.16268293 0.8978313  0.11163617 0.21824921 0.09798013 0.34457175
  0.54206904 0.0898361 ]]
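
The keys stored in an .npz archive are listed in its .files attribute, and np.savez_compressed works the same way but compresses the data; a small sketch (the file name here is just illustrative):

print(teste.files)   # ['gauss', 'uniforme']
np.savez_compressed('lixo_comp.npz', gauss=sample, uniforme=sample_uni)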

Linear Algebra in a World of Objects

In [22]:
# life is much easier with a linear algebra framework

matrox_a = np.random.randn(16).reshape(4,4)
matrox_b = np.random.randn(16).reshape(4,4)

matrox_c = np.matmul(matrox_a.transpose(),matrox_a)
print(matrox_c,end='\n\n')
[[ 4.67504928  0.92618853 -2.07977419 -1.04294985]
 [ 0.92618853  2.53010689 -2.72975834 -2.00547937]
 [-2.07977419 -2.72975834  3.39084018  1.70578653]
 [-1.04294985 -2.00547937  1.70578653  3.26334473]]

In [23]:
# keepdims=True keeps the reduced axis, which allows broadcasting to compute X - E(X)
medias_c_col = matrox_c.mean(axis=0, keepdims=True)
print('For the given matrix:\n')
print(matrox_c,end='\n\n')
print('We compute the mean value over the columns:\n')
print(medias_c_col,end='\n\n')

matrox_c_centered = matrox_c - medias_c_col

print('By broadcasting we can compute X-E(X)\n')
print(matrox_c_centered,end='\n\n')

divisor = matrox_c_centered.shape[0] - 1

print('Doing X^T X and dividing by N-1 we obtain the covariance matrix')
cov_c = np.matmul(matrox_c_centered.T,matrox_c_centered)/divisor

# this produces the covariance matrix of matrox_c
print(cov_c,end='\n\n')

# in the future we will use the much more convenient np.cov, which does this in one step
print(np.cov(matrox_c),end='\n\n')
For the given matrix:

[[ 4.67504928  0.92618853 -2.07977419 -1.04294985]
 [ 0.92618853  2.53010689 -2.72975834 -2.00547937]
 [-2.07977419 -2.72975834  3.39084018  1.70578653]
 [-1.04294985 -2.00547937  1.70578653  3.26334473]]

We compute the mean value over the columns:

[[ 0.61962844 -0.31973557  0.07177354  0.48017551]]

By broadcasting we can compute X-E(X)

[[ 4.05542084  1.2459241  -2.15154773 -1.52312536]
 [ 0.30656009  2.84984247 -2.80153188 -2.48565488]
 [-2.69940263 -2.41002276  3.31906663  1.22561102]
 [-1.66257829 -1.6857438   1.63401298  2.78316922]]

Doing X^T X and dividing by N-1 we obtain the covariance matrix
[[ 8.86378614  5.07823246 -7.0868137  -4.95819041]
 [ 5.07823246  6.10795695 -7.13938104 -5.54229472]
 [-7.0868137  -7.13938104  8.72131343  6.28544586]
 [-4.95819041 -5.54229472  6.28544586  5.91551478]]

[[ 8.86378614  5.07823246 -7.0868137  -4.95819041]
 [ 5.07823246  6.10795695 -7.13938104 -5.54229472]
 [-7.0868137  -7.13938104  8.72131343  6.28544586]
 [-4.95819041 -5.54229472  6.28544586  5.91551478]]

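One caveat worth noting: np.cov treats each row as a variable by default, and it agrees with the column-wise computation above only because matrox_c is symmetric. For a general data matrix with variables in columns, pass rowvar=False; a quick sketch:

# variables in columns, observations in rows
print(np.cov(matrox_c, rowvar=False))
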
In [24]:
# now we compute the standard deviations over the columns and define a diagonal matrix with them
std_c_col = np.sqrt(np.diagonal(cov_c))

std_c = np.zeros_like(matrox_c)
# now we fill the diagonal with the std of the columns
np.fill_diagonal(std_c,std_c_col)

# what we really want is to compute the inverse of this matrix
inv_std_c = np.linalg.inv(std_c)

print('We use this matrix to compute the correlation matrix:\n')
print(inv_std_c,end='\n\n')

print('The correlation matrix is given by:\n')

corr_c = np.matmul(inv_std_c,np.matmul(cov_c,inv_std_c))
print(corr_c,end='\n\n')

print('Which could be easily computed by the instruction np.corrcoef producing:\n')
print(np.corrcoef(matrox_c))
# we can now compute the correlation matrix
We use this matrix to compute the correlation matrix:

[[0.33588481 0.         0.         0.        ]
 [0.         0.40462435 0.         0.        ]
 [0.         0.         0.33861723 0.        ]
 [0.         0.         0.         0.41115325]]

The correlation matrix is given by:

[[ 1.          0.69016822 -0.80602857 -0.68472675]
 [ 0.69016822  1.         -0.97818642 -0.92203066]
 [-0.80602857 -0.97818642  1.          0.87508224]
 [-0.68472675 -0.92203066  0.87508224  1.        ]]

Which could be easily computed by the instruction np.corrcoef producing:

[[ 1.          0.69016822 -0.80602857 -0.68472675]
 [ 0.69016822  1.         -0.97818642 -0.92203066]
 [-0.80602857 -0.97818642  1.          0.87508224]
 [-0.68472675 -0.92203066  0.87508224  1.        ]]
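
np.corrcoef follows the same rowvar convention, so the column-wise correlation would be computed as below; here the result coincides with the output above because matrox_c is symmetric.

print(np.corrcoef(matrox_c, rowvar=False))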