Capacity, Overfitting and Underfitting

In [1]:
import numpy as np
import matplotlib.pyplot as plt
In [2]:
plt.rcParams['figure.figsize'] = (20.0, 10.0)
plt.rcParams["figure.titlesize"] = 40
In [3]:
# We will need to generate matrices whose columns are recurrent powers of the first column.
# That boring job is done by this workhorse. Optimization must be done later.
def generate_pow_cols(n,a):
    
    if n == 0:
        return a
    else :
        for i in range(0,n-1):
            tmp = np.array(a[:,0] * a[:,a.shape[1]-1]).reshape(a.shape[0],1)
            a = np.append(a,tmp,axis = 1)
    
    return a
In [4]:
# to test the function above
teste = np.arange(5).reshape(5,1)
print(teste,end = '\n\n')

new_teste = generate_pow_cols(4,teste)
print(new_teste,end='\n\n')
[[0]
 [1]
 [2]
 [3]
 [4]]

[[  0   0   0   0]
 [  1   1   1   1]
 [  2   4   8  16]
 [  3   9  27  81]
 [  4  16  64 256]]

In [5]:
# this generate a array of powers of x. There will be n powers +, raging from 0 to n-1
def generate_pow_array(n,x):
    
    if n == 0 :
        return 1 
    else :
        out = np.ones(n)
        for i in range(0,n):
            out[i] = x**i
        
    return out 

# from a given array of coefficients this function creates a polynomial
def func(coeff , x):
    
    n = coeff.size
    xx = generate_pow_array(n,x)
    out = np.sum(coeff * xx)
    
    return out

# we need to apply the previous function to a column matrix
def func_over_array(coeff , x):
    
    out = np.zeros_like(x)
    
    # we assume that x is a column matrix
    for i in range(0,x.shape[0]):
        out[i,0] = func(coeff,x[i,0])
    
    return out
    
In [6]:
# to test the function it is enough to 
generate_pow_array(10,2)
Out[6]:
array([  1.,   2.,   4.,   8.,  16.,  32.,  64., 128., 256., 512.])
In [7]:
# we generate a fixed set of random coefficients
coef_inf = -3
coef_sup = 2
coef_size = 5
#coef = np.random.uniform(coef_inf,coef_sup,coef_size)
coef = np.ones(coef_size)
coef
Out[7]:
array([1., 1., 1., 1., 1.])
In [8]:
# using this coef array we can compute the value of the function func
func(coef,2)
Out[8]:
31.0
In [9]:
# that when x = 1 corresponds to the sum of the elements of coef
func(coef,1) - np.sum(coef)
Out[9]:
0.0
In [10]:
teste = np.arange(5.0).reshape(5,1)
print(teste)
print(func_over_array(coef,teste))

func(coef,4)
[[0.]
 [1.]
 [2.]
 [3.]
 [4.]]
[[  1.]
 [  5.]
 [ 31.]
 [121.]
 [341.]]
Out[10]:
341.0
In [11]:
%%capture
'''
we need to define a function like this

def funcao(x):
    
    return .3*x**3 + 3.4 * x**2 + 7 * x + 2
'''

Linear models without noise

In [27]:
# the parameters for all the runs
coef_inf = -10 
coef_sup = 10 

# we generate random values of space in the interval [-amplitude,amplitude]
amplitude = 1
In [26]:
# start by generating a set of coefficients for the polynomial function
coef_size = 5
coef = np.random.uniform(coef_inf,coef_sup,coef_size)
coef
Out[26]:
array([-5.90454732, -2.65465255,  3.77066959, -4.92054491, -1.67884587])
In [28]:
# we create the points to train
# the number of points that we are going to sample.
# In the linear case without noise, it is enough to have equality between the number of points and variables
npts_treino = coef.size

xx_treino = np.random.uniform(-amplitude,amplitude,npts_treino).reshape(npts_treino,1)

# we need one less column than coef.size because bellow we will add the o^th power column
data_treino = generate_pow_cols(coef.size-1,xx_treino)

# we need to add a column of ones to take care of the intersection value
bb_treino = np.ones_like(xx_treino)
# and we append it in the first column of data
data_treino = np.append(bb_treino,data_treino,axis = 1)

# from this matrix we create the trasnformation matrix for least squares
matrix = np.linalg.inv(np.matmul(data_treino.transpose(),data_treino))
matrix = np.matmul(matrix,data_treino.transpose())

yy_treino = func_over_array(coef,xx_treino)
ww = np.matmul(matrix,yy_treino)
print(ww)
[[-5.90454732]
 [-2.65465255]
 [ 3.77066959]
 [-4.92054491]
 [-1.67884587]]
In [29]:
# At this moment we repeat all the previous process for the test set
# we generate random values of space in the interval [-amplitude,amplitude]
amplitude = 5
# the number of points that we are going to sample for the test.
npts_teste = 20

xx_teste = np.random.uniform(-amplitude,amplitude,npts_teste).reshape(npts_teste,1)

bb_teste = np.ones_like(xx_teste)
data_teste = generate_pow_cols(coef.size-1,xx_teste)
data_teste = np.append(bb_teste,data_teste,axis = 1)

# this is the same of computing X^T \cdot w = \hat y
yy_hat = np.matmul(data_teste,ww)

# the correct value of the test points
yy_teste = func_over_array(coef,xx_teste)

result = np.append(yy_hat,yy_teste,axis=1)
print(result)
[[ 2.64973422e+01  2.64973422e+01]
 [-1.12214358e+02 -1.12214358e+02]
 [ 1.36046256e+01  1.36046256e+01]
 [-5.05055271e+01 -5.05055271e+01]
 [ 9.51866037e+00  9.51866034e+00]
 [-6.89330339e+01 -6.89330339e+01]
 [ 1.90791984e+01  1.90791984e+01]
 [-6.16097227e+00 -6.16097227e+00]
 [-9.89308591e+01 -9.89308591e+01]
 [-7.97551805e+02 -7.97551806e+02]
 [-1.14955672e+03 -1.14955672e+03]
 [-5.22087859e+00 -5.22087859e+00]
 [ 3.60711067e+01  3.60711067e+01]
 [-4.08012478e+02 -4.08012478e+02]
 [-4.25156711e+00 -4.25156711e+00]
 [ 8.57600943e-01  8.57600943e-01]
 [-7.20621863e+00 -7.20621863e+00]
 [-3.93733600e+01 -3.93733600e+01]
 [ 1.71316861e+01  1.71316861e+01]
 [-6.62702842e+02 -6.62702843e+02]]
In [30]:
# computing the mean square error
np.square(yy_hat - yy_teste).mean()
Out[30]:
3.2437336676460084e-15
In [16]:
coef
Out[16]:
array([-2.55618136, -0.76594647,  0.2353325 ,  1.18516978, -0.64702227])
In [17]:
xp = np.linspace(-5,5,100)

# do not forget that the input x of func_over_array is a column matrix
yp = func_over_array(coef,xp.reshape(xp.size,1))
plt.plot(xp,yp,c = 'k')
plt.scatter(xx_treino,yy_treino, c = 'r',alpha = 1,linewidths = 3)
plt.scatter(xx_teste,yy_hat, c = 'y', alpha = 1, linewidths = 3)
plt.show()

Lower Capacity than necessary leads to Underfitting

In [41]:
# this induces later that the generator polynomial will have degree coef2_size - 1
coef2_size = 7
coef2 = np.random.uniform(coef_inf,coef_sup,coef2_size)
coef2
Out[41]:
array([3.97271086, 8.71167471, 4.86138134, 9.60402323, 2.40489549,
       4.41243458, 5.2854752 ])
In [42]:
# the number of points that we are going to sample.
# Notice that we are generating the double of points to train in relation to first example
npts_treino2 = 2 * coef2.size
# in opposition to what happened in the previous case, the values of coefficients did not dependended
# on the trainning data. This is not the case here.
xx_treino2 = np.random.uniform(-amplitude,amplitude,npts_treino2).reshape(npts_treino2,1)

# Because our goal is to low capacity, in this case this corresponds to lower degree polynomial approximation
data_treino2 = generate_pow_cols(coef2.size-4,xx_treino2)

# like before
bb_treino2 = np.ones_like(xx_treino2)
# and we append it in the first column of data
data_treino2 = np.append(bb_treino2,data_treino2,axis = 1)

# from this matrix we create the trasnformation matrix for least squares
matrix2 = np.linalg.inv(np.matmul(data_treino2.transpose(),data_treino2))
matrix2 = np.matmul(matrix2,data_treino2.transpose())

yy_treino2 = func_over_array(coef2,xx_treino2)
ww2 = np.matmul(matrix2,yy_treino2)
print(ww2)
[[-3548.30767408]
 [ 2879.5127527 ]
 [ 1522.33436787]
 [ -152.99132286]]
In [43]:
# as opposed to what happened in the previous scenario, there is a difference between the 
xp2 = np.linspace(-amplitude,amplitude,100)
yp2 = func_over_array(ww2,xp2.reshape(xp2.size,1))

plt.plot(xp2,yp2,c='k')
plt.scatter(xx_treino2,yy_treino2,c='r',linewidths=3)
plt.title('Lower Capacity leads to Underfitting')
plt.show()

Higher Capacity than necessary leads to Overfitting

We repeat the process but at this time, with exactly the same model data generator, we make a higher degree approximation.

In [44]:
npts_treino3 = coef2.size
xx_treino3 = np.random.uniform(-amplitude,amplitude,npts_treino3).reshape(npts_treino3,1)

# Because our goal is to increase capacity, in this case this corresponds to higher degree polynomial approximation
data_treino3 = generate_pow_cols(coef2.size + 1,xx_treino3)

bb_treino3 = np.ones_like(xx_treino3)
data_treino3 = np.append(bb_treino3,data_treino3,axis = 1)

matrix3 = np.linalg.inv(np.matmul(data_treino3.transpose(),data_treino3))
matrix3 = np.matmul(matrix3,data_treino3.transpose())

yy_treino3 = func_over_array(coef2,xx_treino3)
ww3 = np.matmul(matrix3,yy_treino3)
print(ww3)
[[-1.00907830e+04]
 [ 2.92418895e+02]
 [-2.10405782e+02]
 [-1.18126316e+02]
 [ 8.12131357e+01]
 [ 3.35337150e+01]
 [-2.69152755e-01]
 [-3.14046928e-01]
 [ 3.01425893e-01]]
In [45]:
xp3 = np.linspace(-amplitude,amplitude,100)
yp3 = func_over_array(coef2,xp3.reshape(xp3.size,1))

plt.plot(xp3,yp3,c='k')
plt.scatter(xx_treino3,yy_treino3,c='r',linewidths=3)
plt.title('Increasing the Capacity we increase the fit over the trainning data.')
plt.show()

There is a perfect fit between the approximation function and the trainning data. The overfitting appears when we try to generalize the approximation to the test set.

In [47]:
# the number of points that we are going to sample for the test.
npts_teste3 = 20

xx_teste3 = np.random.uniform(-amplitude3,amplitude3,npts_teste3).reshape(npts_teste3,1)

bb_teste3 = np.ones_like(xx_teste3)
data_teste3 = generate_pow_cols(coef2.size + 1,xx_teste3)
data_teste3 = np.append(bb_teste3,data_teste3,axis = 1)

# this is the same of computing X^T \cdot w = \hat y
yy_hat3 = np.matmul(data_teste3,ww3)

# the correct value of the test points
yy_teste3 = func_over_array(coef2,xx_teste3)

result = np.append(yy_hat3,yy_teste3,axis=1)
print(result,end='\n\n')

# the mean square error
print("The mean square error is %f.\n" % np.square(yy_teste3 - yy_hat3).mean())
[[ 9.31145671e+05  3.69333707e+05]
 [ 4.19199147e+06  1.10825130e+06]
 [ 7.52457002e+05  3.16843476e+05]
 [ 1.92107649e+06  6.24869997e+05]
 [ 2.79212499e+07  5.34972885e+06]
 [-1.00850520e+04  3.65137180e+02]
 [ 6.27259857e+06  1.67242026e+06]
 [ 5.53959014e+03  7.33691244e+03]
 [ 1.85586042e+05  1.18641869e+05]
 [ 4.29954161e+05  1.81433583e+05]
 [ 3.79231267e+06  1.02931873e+06]
 [-1.01177241e+04  3.24562254e+00]
 [-9.46107987e+03  1.46064812e+03]
 [ 6.63473797e+05  2.63758994e+05]
 [ 2.12869494e+05  9.80187734e+04]
 [ 2.24756939e+07  3.86117612e+06]
 [ 1.68303741e+06  5.67367288e+05]
 [ 9.29430279e+06  1.99876724e+06]
 [ 1.32211659e+07  2.59823994e+06]
 [-1.00034545e+04  1.41969652e+01]]

The mean square error is 53201018202161.671875.

In [49]:
xp3 = np.linspace(-amplitude3,amplitude3,100)
yp3 = func_over_array(coef2,xp3.reshape(xp3.size,1))

plt.plot(xp3,yp3,c='k')
plt.scatter(xx_treino3,yy_treino3,c='r',linewidths=3)
plt.scatter(xx_teste3,yy_hat3,c='g',linewidths=3)
plt.title('Increasing the Capacity we reduce the fit over the trainning data.')
plt.show()