import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (12.0, 8.0)
def IntToBin(number):
    # converts a non-negative integer into the list of its binary digits, most significant first
    out = []
    run = number
    while run > 0:
        run, resto = divmod(run, 2)
        out.insert(0, resto)
    return out
def BinToInt(lista):
    # converts a list of binary digits, most significant first, back into an integer
    out = 0
    expoente = len(lista) - 1
    for i in range(len(lista)):
        out += lista[i] * (2**expoente)
        expoente -= 1
    return out
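A quick sanity check, comparing our converter with Python's built-in bin and doing a round trip through both functions (both lines should print True):
print(IntToBin(100) == [int(d) for d in bin(100)[2:]])
print(BinToInt(IntToBin(12343)) == 12343)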
One common operation between numbers, which is especially simple when they are already in binary format, is the xor!
def XorLists(lista_a, lista_b):
    # we copy the lists (so the originals are not modified) and help_a will be the longer one
    if len(lista_a) > len(lista_b):
        help_a = list(lista_a)
        help_b = list(lista_b)
    else:
        help_a = list(lista_b)
        help_b = list(lista_a)
    # we add 0's at the front of the shorter one until both have equal length
    while len(help_b) < len(help_a):
        help_b.insert(0, 0)
    # now we perform the xor over the corresponding elements of both lists
    return list(map(lambda x, y: x ^ y, help_a, help_b))
lista100 = IntToBin(100)
lista12343 = IntToBin(12343)
xorlista = XorLists(lista100 , lista12343)
BinToInt(xorlista)
What we have done indeed corresponds to the usual xor between two integers in Python.
print(100 ^ 12343)
print(12343 ^ 100)
expoente = IntToBin(100)
expoente
3**BinToInt(expoente)
base = 3
resultado = 1
for i in range(len(expoente) - 1, -1, -1):
    if expoente[i] == 1:
        resultado *= base
    base *= base
print(base, resultado)
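The loop above is the classical square-and-multiply (binary exponentiation) scheme: scanning the exponent bits from least to most significant, we square the base at every step and multiply it into the result whenever the bit is 1. Wrapped as a reusable function (a sketch; the name fast_pow is ours, not part of the original code):
def fast_pow(base, exponent_bits):
    # exponent_bits: binary digits of the exponent, most significant first (as produced by IntToBin)
    resultado = 1
    for i in range(len(exponent_bits) - 1, -1, -1):
        if exponent_bits[i] == 1:
            resultado *= base
        base *= base
    return resultado

print(fast_pow(3, IntToBin(100)) == 3**100)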
def signedCodification(x):
    out = -1
    if x > 0.5:
        out = 1
    return int(out)
import random

cumsum = 0
ngames = 2000
game_length = 10
values = []
for i in range(ngames):
    cumsum = 0
    for j in range(game_length):
        cumsum += signedCodification(random.random()) * 2**j
    values.append(cumsum)
after_values = list(set([abs(x) for x in values]))
after_values.sort()
print(after_values)
It is indeed very easy to see that we can get any odd integer using this method. Basically, every odd number, being an integer, has a binary expansion whose last digit is always equal to 1.
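We can check this claim directly on the simulated values (this should print True):
print(all(x % 2 == 1 for x in after_values))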
IntToBin(19)
BinToInt([1,1,-1,-1,1])
IntToBin(245)
BinToInt([1, 1, 1, 1, 1, -1, 1, -1])
\textbf{Question:} can you generalize the idea of this algorithm and from that generalization create a signed binary expansion for any odd integer?
In the first version of the game we assume that the number of draws is fixed. Suppose that $n=10$.
def HalfGameRun(ndraws):
    def codification(x):
        if x > 0.5:
            return 1
        else:
            return -1
    tmp = np.array(list(map(lambda x: codification(x), np.random.uniform(size=ndraws))))
    out = np.multiply(np.array([1.0 / 2**i for i in range(ndraws)]), tmp)
    return out
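For comparison, a single run of the halving version: since the summands are $\pm 1/2^i$, its total always lies strictly between $-2$ and $2$.
np.sum(HalfGameRun(100))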
def DoubleGameRun(ndraws):
    def codification(x):
        if x > 0.5:
            return 1
        else:
            return -1
    tmp = np.array(list(map(lambda x: codification(x), np.random.uniform(size=ndraws))))
    out = np.multiply(np.array([2**i for i in range(ndraws)]), tmp)
    return out
np.sum(DoubleGameRun(100))
ndraws = 100
nsimul = 100000
simulation = np.array([])
for i in range(nsimul):
    simulation = np.append(simulation, np.sum(DoubleGameRun(ndraws)))
print(np.mean(simulation))
print(np.std(simulation))
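As a sanity check on these numbers: the draws are independent and each summand $\pm 2^i$ has variance $4^i$, so the theoretical mean is $0$ and the theoretical standard deviation is $\sqrt{(4^{n}-1)/3}$. A quick computation for comparison:
# theoretical standard deviation of the doubling game with ndraws independent draws
print(np.sqrt((4.0**ndraws - 1.0) / 3.0))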
This game is very simple and is based on a simple repetition of Heads and Tails. You start with €1, you choose one of the possible results (we will suppose Heads) and you have to pay €1 each time you play. If the result of the draw is Heads you receive €2, otherwise you receive nothing. The game stops when you run out of cash. The player who achieves the longer series wins.
After $n$ draws, if positive, your cash will be $2S_n-n$, where $S_n=X_0+\dots+X_{n-1}$ and $X_k=1$ if the result is Heads and $0$ otherwise. Again, we start by generating a random number in the interval $[0,1)$ and convert it to $0$ or $1$.
def codification(x):
    if x > 0.5:
        return 1
    else:
        return 0
Because we do not know in advance how many numbers we will need to generate, it is more convenient to produce them one at a time with the \textbf{random} library.
import random
One simple experiment of the game will be given by the following code.
ndraws = 1
currentvalue = 2 * codification(random.random()) - 1
while currentvalue != 0:
    currentvalue += 2 * codification(random.random()) - 1
    ndraws += 1
print("The number of draws until we come back to zero: %d" % ndraws)
Putting everything in a loop, we simulate the game several times.
nsimuls = 10000
resultados = []
for i in range(nsimuls):
    currentvalue = 2 * codification(random.random()) - 1
    ndraws = 1
    while currentvalue != 0:
        currentvalue += 2 * codification(random.random()) - 1
        ndraws += 1
    resultados.append(ndraws)
print(min(resultados), max(resultados))
# the counting algorithm below only works correctly if the list is sorted
resultados.sort()
array_resultados = np.array([[], []], dtype=np.int64).transpose()
number = resultados[0]
count = 1
for i in range(1, len(resultados)):
    if resultados[i] == number:
        count += 1
    else:
        array_resultados = np.append(array_resultados, [[number, count]], axis=0)
        number = resultados[i]
        count = 1
# do not forget to store the last group
array_resultados = np.append(array_resultados, [[number, count]], axis=0)
print(array_resultados)
# array_resultados = np.array([[],[]],dtype=np.int64).transpose()
# for k in range(min(resultados),max(resultados)+1):
# count_k = resultados.count(k)
# #len([i for i, j in enumerate(resultados) if j == k])
# if (count_k > 0):
# array_resultados = np.append(array_resultados,[[k,count_k]],axis = 0)
# print(array_resultados.shape)
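As a side note, NumPy can build the same frequency table in a single call; a sketch using np.unique (not used in the rest of this notebook):
numbers, counts = np.unique(resultados, return_counts=True)
print(np.column_stack((numbers, counts)))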
tamanho = int(array_resultados.shape[0]/10)
coord_x = array_resultados[0:tamanho,0]
coord_y = (1.0/nsimuls) * array_resultados[0:tamanho,1]
# the series to which we want to make a comparison
comp_y = np.array([1.0/i**2 for i in coord_x])
plt.figure(figsize=(12,8))
plt.plot(coord_x,coord_y)
plt.plot(coord_x,comp_y)
plt.show()
with open("GetEvenSimulations.txt", "r") as file_results:
    resultados = np.array([int(x) for x in file_results])
resultados[0:10]
print(min(resultados),max(resultados))
m = max(resultados)
[i for i, j in enumerate(resultados) if j == m]
resultados[30928]
bignumbers = [x for x in resultados if x > 1000000]
print(len(bignumbers))
Suppose that we have a sample, call it $(x_i)$ with $i\in\{0,\dots,N-1\}$, and we want to estimate its mean value, call it $\mu$. The usual method is to use the sample mean, from which we obtain
\begin{equation}\label{eq:sgd.10} \hat\mu_N = \frac{1}{N}\sum_{i=0}^{N-1} x_i. \end{equation}Plain and simple. Now suppose that $N$ is very large, really, really large, and you have an old, really, really old computer. Your computer is so old that it does not have enough memory to load the array (and you do not know how to do an incremental mean). Because your sample is really big, it occurs to you that perhaps you do not need to compute the sum of all elements. The idea is that, up to a certain degree of accuracy,
$$ \hat\mu_N = \hat\mu_{N-1} = \dots = \hat\mu_{N-p}. $$If $N$ is big then $p$ may also be big. From here on, to keep the formulas aligned with the code below, write $\hat\mu_N$ for the mean of the first $N+1$ points $x_0,\dots,x_N$. With this convention we easily derive the incremental mean recurrence
\begin{equation}\label{eq:sgd.30} \hat\mu_N = \left(\frac{N}{N+1}\right) \hat\mu_{N-1} + \left(\frac{1}{N+1}\right)x_N. \end{equation}The last recursive relation is translated into Python by the next simple code.
def incremental_mean(oldmean, newvalue, npts):
    # npts is the number of points already averaged in oldmean
    out = (npts / (npts + 1.0)) * oldmean + newvalue / (npts + 1.0)
    return out
We will test this function using a very well-known distribution...
# we will generate Gaussian deviates (more on this in the future) with the following parameters
data_mean = 2
data_std = 3
data_size = 5000
data = data_mean + data_std * np.random.randn(data_size)
print(data.mean())
We now use incremental_mean to compute the mean in its online version.
media = 0
comprimento = data.size
cummedia = np.empty(comprimento, float)
for i in range(cummedia.size):
    newvalue = data[i]
    # the value of npts must be in accordance with the expression in incremental_mean
    npts = i
    cummedia[i] = incremental_mean(media, newvalue, npts)
    media = cummedia[i]
print('The mean value of the array computed by an online algorithm is %.14f.\n' %cummedia[cummedia.size - 1])
Because we have stored the incremental results in an array, we can plot the evolution of the mean.
steps = np.arange(data.size)
exact_mean = data.mean() * np.ones_like(steps)
plt.plot(steps, cummedia,label='Incremental')
plt.plot(steps , exact_mean , label = 'Exact Value')
plt.legend(loc='upper right')
plt.show()
The natural question to ask is: given $\varepsilon>0$, what would be the magnitude of $x_N$ such that $|\hat\mu_N-\hat\mu_{N-1}|>\varepsilon$? From (\ref{eq:sgd.30}) we have $\hat\mu_N-\hat\mu_{N-1}=\frac{x_N-\hat\mu_{N-1}}{N+1}$, so the condition becomes
\begin{equation}\label{eq:sgd.40} |x_N-\hat\mu_{N-1}|>(N+1)\cdot\varepsilon. \end{equation}Suppose that you have computed the mean value of the array. From the observed values you can forecast how probable it is that the next element will satisfy equation (\ref{eq:sgd.40}). For that we will compute the \textbf{Empirical Cumulative Distribution Function}.
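Before that, to make the threshold in (\ref{eq:sgd.40}) concrete, here is a small sketch on the Gaussian sample from above: for an arbitrary $\varepsilon$ (the value 0.001 is ours) we count the fraction of observations that deviate from the mean by more than $(N+1)\cdot\varepsilon$, using the full sample mean as a stand-in for $\hat\mu_{N-1}$.
eps = 0.001  # arbitrary accuracy level, chosen for illustration
threshold = (data.size + 1) * eps  # right-hand side of the stability condition
print(np.mean(np.abs(data - data.mean()) > threshold))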
One particular aspect that lies at the core of the \textbf{Kolmogorov-Smirnov Test} is how well the results we have found adjust to the cumulative distribution function of the random variable we expect to find.
# the code of an empirical cumulative distribution function
def my_ecdf(in_array):
    # returns the levels k/n; to plot, pair them with the sorted sample values
    cumulation = np.cumsum(np.ones_like(in_array)) / in_array.size
    return cumulation
data_ecdf = my_ecdf(data)
coord_x = np.sort(data)
plt.plot(coord_x,data_ecdf)
plt.show()
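To judge the fit visually we can overlay the cumulative distribution function we expect; a sketch assuming scipy is available for the Gaussian CDF:
from scipy.stats import norm  # assumed available

plt.plot(coord_x, data_ecdf, label='Empirical')
plt.plot(coord_x, norm.cdf(coord_x, loc=data_mean, scale=data_std), label='Gaussian CDF')
plt.legend(loc='lower right')
plt.show()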
Certainly, as the size of our sample gets smaller, the fit tends to get worse.
data2_size = 24
data2 = data_mean + data_std * np.random.randn(data2_size)
data2_ecdf = my_ecdf(data2)
coord_x = np.sort(data2)
plt.plot(coord_x,data2_ecdf)
plt.show()
Let $S_N$ be the sum of the squared errors of the $x_k$ in relation to the mean $\hat\mu_N$,
$$ S_N=\sum_{k=0}^{N}(x_k-\hat\mu_N)^2. $$Straightforward computations show that $S_N$ is given by
$$\label{eq:sgd.60} S_N=\sum_{k=0}^{N}x_k^2 - (N+1)\cdot\hat\mu_N^2. $$This leads to the recurrence relation over $S_N$
\begin{equation}\label{eq:sgd.70} S_N - S_{N-1} = x_N^2-\hat\mu_{N}^2-N(\hat\mu_{N}^2-\hat\mu_{N-1}^2). \end{equation}From equations (\ref{eq:sgd.30}) and (\ref{eq:sgd.70}) we can deduce a much simpler expression for the recurrence,
\begin{equation}\label{eq:sgd.80} S_N = S_{N-1} + (x_N-\hat\mu_{N-1})(x_N-\hat\mu_{N}). \end{equation}Expression (\ref{eq:sgd.70}) will be useful to compute a \textbf{stability interval} for the next value of the time series, while expression (\ref{eq:sgd.80}) is the essential tool for the \textit{online computation} of the variance. Because the former is more challenging, we start with the latter. Keep in mind that to compute $S_N$ we need three values: $x_N$, $\hat\mu_{N-1}$ and $\hat\mu_N$.
def incremental_sn(oldmu, newmu, olds, newvalue):
    out = olds + (newvalue - newmu) * (newvalue - oldmu)
    return out
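A quick check of the pair incremental_mean/incremental_sn against NumPy, on a tiny array (the values [1.0, 2.0, 4.0] are arbitrary, chosen only for this test):
tiny = np.array([1.0, 2.0, 4.0])
mu, s = tiny[0], 0.0
for k in range(1, tiny.size):
    mu_new = incremental_mean(mu, tiny[k], k)
    s = incremental_sn(mu, mu_new, s, tiny[k])
    mu = mu_new
# the last two printed values should coincide
print(mu, s / tiny.size, np.var(tiny))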
vararray = np.zeros(len(resultados))
mu_first = resultados[0]
s_old = 0
# the variance of a single observation is zero
vararray[0] = 0.0
for i in range(1, len(resultados)):
    mu_second = incremental_mean(mu_first, resultados[i], i)
    s_new = incremental_sn(mu_first, mu_second, s_old, resultados[i])
    vararray[i] = s_new / (i + 1)
    s_old = s_new
    mu_first = mu_second
print("The mean of the process is %f.\n" % mu_second)
print("The comparison of the variance computed by both methods:")
print(s_new/(len(resultados)),np.var(resultados))
limit = int(vararray.size/2)
t = np.arange(vararray[0:limit].size)
plt.plot(t,vararray[0:limit])
plt.show()