This Jupyter file goes over the most common NumPy functions for data analysis. It does not have the pretension to be a tutorial or a complete guide — just follow along and look at the inputs and outputs. In a machine learning setting, you will be able to use Python as a computational tool for analysing data. The structure of the instructions follows the book *Python for Data Analysis* by Wes McKinney.
import numpy as np
import matplotlib.pyplot as plt
# --- Array creation and views vs. copies ---
data1 = [1,2.0,3.0,4.5]
arr1 = np.array(data1)  # mixed int/float list is upcast to a single float dtype
print(data1)
print(arr1)
arr2 = np.arange(10)
print(arr2)
# this instruction creates an array of ones with the same shape of the input
arr3 = np.ones_like(arr2)
print(arr3)
# NOTE: basic slicing returns a VIEW of arr3 (it was misleadingly named
# arr2_slice before -- it has nothing to do with arr2)
arr3_view = arr3[3:7]
print(arr3_view)
# now change the value of the first element of the slice
arr3_view[0] = 3
print(arr3)
# the surprise comes from the fact that NumPy does not copy on slicing: it
# gives a view into the same data buffer. To get an independent array,
# copy the elements explicitly:
arr3_slice = arr3[3:7].copy()
arr3_slice[:] = 10
# this will not change the value of the original array
print(arr3_slice)
print(arr3)
# --- Multidimensional arrays and boolean indexing ---
arr2d = np.array([[1,2,3],[2.0,3,1]])
# stacking two 2-d arrays produces a 3-d array (an array of matrices)
arr3d = np.array([arr2d, arr2d])
arr3d[0]  # first 2-d slice; displayed in a notebook, a no-op in a script
# broadcasting a scalar onto a row is efficient: the loop runs in the C core
arr3d[1, 0] = 12
print(arr3d)
# boolean (fancy) indexing: one label per row of the data matrix below
names = np.array(['Patita','Pipa','Isa','teste','Patita','Pipa','Isa'])
data = np.random.randn(7, 4)
print(data)
print()
print(data[names == 'Patita'])
print()
print(data[names != 'Patita'])
print()
# equivalent to (names == 'Patita') | (names == 'Pipa')
mask = np.isin(names, ['Patita', 'Pipa'])
print(data[mask])
# --- Reshaping, meshgrid and vectorized evaluation ---
# composing functions produces arrays with shapes other than unidimensional
arr = np.arange(30).reshape(10,3)
print(arr)
print()
print(np.matmul(arr.T,arr))
# np.meshgrid takes unidimensional ranges and produces the grid of all pairs
precision = np.arange(-5,5,.1)
xp , yp = np.meshgrid(precision,precision)
xp
# compute the radius of every point of the grid at once --
# this is an example of vectorization
z = np.sqrt(xp**2 + yp**2)
fig = plt.figure(figsize=(10, 10))
plt.imshow(z,cmap = plt.cm.gray)
# raw string: "\s" inside a plain literal is an invalid escape sequence
# (DeprecationWarning today, a SyntaxError in future Python versions)
plt.title(r"Image of $\sqrt{x^2+y^2}$ for the grid")
plt.show()
# --- Setup for the np.where demonstration ---
xarr = np.array([1.2,1.2,1.3,1.5])
yarr = np.array([3.4,5.6,3.4,9.3])
for vec in (xarr, yarr):
    print(vec)
print()
# this function transforms a uniform(0,1) sample into a binary array
def escolhabin(in_arr):
    """Threshold *in_arr* at 0.5: entries < 0.5 become 0, all others become 1.

    Returns an array with the same shape and dtype as the input.
    """
    out = np.ones_like(in_arr)
    # vectorized mask assignment replaces the original element-wise loop
    out[in_arr < 0.5] = 0
    return out
# --- np.where: vectorized conditional selection ---
sample = np.random.uniform(0,1,len(xarr))
print(sample)
print(escolhabin(sample))
# build a boolean condition from the sample
cond = np.array(sample > .5)
print(cond)
print()
# a pythonic (but slow) construction; zip stops at the shortest iterable
result = np.array([(a if flag else b) for a, b, flag in zip(xarr, yarr, cond)])
print()
print(result)
# np.where achieves the same result in a single vectorized call
result = np.where(cond,xarr,yarr)
print(result)
# escolhabin becomes redundant: the binary sample comes directly from
sample = np.where(np.random.uniform(0,1,len(xarr)) > 0.5, 1 , 0)
print(sample)
cond = np.array(sample > 0.5)
print(cond)
result = np.where(cond , xarr, yarr)
print(result)
# np.where can also leave part of the input untouched
tmparr = np.random.randn(10)
sample = np.where(tmparr > 0 , 1 , tmparr)
print(sample)
# NumPy is object oriented: the module-level function...
print(np.mean(sample))
# ...and the method call produce exactly the same result
print(sample.mean())
# --- Statistics along the axes of a 2-d array ---
sample = np.random.randn(30).reshape(10,3)
# column-wise mean and std (axis=0 collapses the rows)
print(sample.mean(axis=0),sample.std(axis=0),end='\n\n')
# row-wise mean and std (axis=1 collapses the columns)
print(sample.mean(axis=1),end='\n\n')
print(sample.std(axis=1))
# argmin (and argmax) index into the flattened, row-major array
print(sample.min())
print(sample.argmin(),end='\n\n')
print(sample)
# --- Saving arrays to disk and loading them back ---
# we define an array and save it to the binary file lixo.npy ("lixo" : garbage)
sample = np.random.randn(100).reshape(20,5)
print(sample[0:5])
np.save('lixo',sample)
# the inverse process is performed by the np.load function
sample = np.load('lixo.npy')
print(sample[0:5])
# several arrays may be saved under keyword names and recovered by key;
# np.load on an .npz keeps the file handle open, so close it deterministically
# by using the NpzFile as a context manager
sample_uni = np.random.uniform(0,1,100).reshape(5,20)
np.savez('lixo.npz', gauss = sample , uniforme = sample_uni)
with np.load('lixo.npz') as teste:
    print(teste['gauss'][0:5],end='\n\n')
    print(teste['uniforme'][0:2])
# --- Covariance and correlation via the linear algebra framework ---
matrox_a = np.random.randn(16).reshape(4,4)
# A^T A is symmetric positive semi-definite by construction
matrox_c = np.matmul(matrox_a.transpose(),matrox_a)
print(matrox_c,end='\n\n')
# keepdims = True keeps a (1, 4) row so broadcasting can compute X-E(X)
medias_c_col = matrox_c.mean(axis=0, keepdims=True)
print('For the given matrix:\n')
print(matrox_c,end='\n\n')
print('We compute the mean value over the columns:\n')
print(medias_c_col,end='\n\n')
matrox_c_centered = matrox_c - medias_c_col
print('By broadcasting we can compute X-E(X)\n')
print(matrox_c_centered,end='\n\n')
divisor = matrox_c_centered.shape[0] - 1
print('Doing X^T X and dividing by N-1 we obtain the covariance matrix')
cov_c = np.matmul(matrox_c_centered.T,matrox_c_centered)/divisor
# this produces the covariance matrix of the COLUMNS of matrox_c
print(cov_c,end='\n\n')
# np.cov treats ROWS as the variables by default; rowvar=False makes it treat
# the columns as variables, matching the manual computation above. (Without it
# the result only coincides because matrox_c happens to be symmetric.)
print(np.cov(matrox_c, rowvar=False),end='\n\n')
# standard deviations of the columns, placed on the diagonal of a matrix
std_c_col = np.sqrt(np.diagonal(cov_c))
std_c = np.zeros_like(matrox_c)
np.fill_diagonal(std_c,std_c_col)
# what we really want is the inverse of this diagonal matrix
inv_std_c = np.linalg.inv(std_c)
print('We use this matrix to compute the correlation matrix:\n')
print(inv_std_c,end='\n\n')
print('The correlation matrix is given by:\n')
# corr = D^{-1} cov D^{-1}
corr_c = np.matmul(inv_std_c,np.matmul(cov_c,inv_std_c))
print(corr_c,end='\n\n')
print('Which could be easily computed by the instruction np.corrcoef producing:\n')
print(np.corrcoef(matrox_c, rowvar=False))
# the two constructions of the correlation matrix agree