There are two main files with data related to the NBA games from season 2012-13 to season 2017-18. The first, \textbf{2012-18_officialBoxScore.csv}, contains the aggregate data for each team at each game. The second, \textbf{2012-18_playerBoxScore.csv}, contains the individual data of each player for each game recorded in the aggregate file. The files were downloaded from the kaggle website https://www.kaggle.com/pablote/nba-enhanced-stats/home, with minor edits. These consisted only of removing the columns of data which can easily be computed from other columns.
import numpy as np
import pandas as pd
import os
from shutil import move
We will split the big files into seasons and move them all to \textbf{destdirname}.
# working directory that holds the raw kaggle csv files
currdirname = '/Users/gmorais/MEGA/DATA/DeepLearning/aulas/week07'
# destination folder that will receive every file we generate
destdirname = '/Users/gmorais/MEGA/DATA/DeepLearning/aulas/week07/files'
os.makedirs(destdirname, exist_ok=True)
# load the aggregate (team-level) box scores; the first file line is the header
tmpdf = pd.read_csv('2012-18_officialBoxScore.csv', header=0)
The original file has three rows for each team for each game, because it contains the names of the officials. We do not need those, so we will remove the repeated rows and delete the officials' columns.
# The raw file repeats each (game, team) row three times, one per official.
# Keep only the first row of every run of identical consecutive teamAbbr.
# NOTE(review): this assumes consecutive games never share the boundary
# team abbreviation — confirm against the data ordering.
rows_to_skip = []
for i in range(1, tmpdf.shape[0]):
    if tmpdf.teamAbbr[i] == tmpdf.teamAbbr[i - 1]:
        # +1 because read_csv's skiprows counts raw 0-indexed file lines,
        # where line 0 is the header: dataframe row i lives on file line i+1.
        # (Without the +1 the last, not the first, row of each run survived;
        # the rows only differ in the officials columns dropped below.)
        rows_to_skip.append(i + 1)
del tmpdf
newdf = pd.read_csv('2012-18_officialBoxScore.csv', header=0, skiprows=rows_to_skip)
print(newdf.shape)
# officials' names and other redundant columns are not needed
colnames_to_drop = ['seasTyp','offLNm','offFNm','teamRslt','opptConf','opptDiv']
newdf = newdf.drop(colnames_to_drop, axis=1)
newdf.iloc[0:10, 0:12]
Next we remove the Conference and Division columns and store them in a separate csv file, \textbf{NBAteams.csv}.
# Collect the 30 distinct teams, in order of first appearance, together with
# their conference and division.  (The first version seeded team_def with a
# dummy element that had to be deleted afterwards, and kept a separate
# counter that just mirrored len(team_names).)
nteams = 30
team_names = []
team_def = []
i = 0
while len(team_names) < nteams:
    abbr = newdf.teamAbbr[i]
    if abbr not in team_names:
        team_names.append(abbr)
        team_def.append([abbr, newdf.teamConf[i], newdf.teamDiv[i]])
    i += 1
# Persist the team definitions in their own small dataframe / csv file.
nba_team_names_columns = ['teamAbbr','teamConf','teamDiv']
nba_team_names = pd.DataFrame(team_def, columns=nba_team_names_columns)
nbateamfilename = 'NBAteams.csv'
nba_team_names.to_csv(nbateamfilename, index=False)
# move() wants full path names: with a bare filename it errors out when the
# file already exists in the destination folder
src_path = os.path.join(currdirname, nbateamfilename)
dst_path = os.path.join(destdirname, nbateamfilename)
move(src_path, dst_path)
nba_team_names.iloc[:, :]
# conference / division now live in NBAteams.csv, so drop them from newdf
newdf = newdf.drop(['teamConf','teamDiv'], axis=1)
newdf.iloc[0:10, 0:12]
We need to record the beginning of each season. We assume that there is a gap bigger than 330 days between the beginning of consecutive seasons.
# Parse the game date strings into Timestamps so that timedelta arithmetic
# works on them.  pd.to_datetime is vectorized over a whole Series, so the
# element-wise map/lambda of the first version is unnecessary.  The unused,
# commented-out game_Time computation was removed.
# NOTE(review): the format assumes dates like 31/12/2012 — confirm against
# the (locally edited) csv file.
game_Day = pd.to_datetime(newdf.gmDate.astype(str), format="%d/%m/%Y")
# Record the row index where each season begins: a gap of at least 330 days
# from the current season's first game marks a new season.
# (The first version kept a separate counter "iterador" that always equalled
# the loop index i — both started at 0 and were incremented exactly once per
# iteration — so the counter is dropped here.)
seasonBeginIndex = [0]
init_Time = game_Day[0]
print(init_Time)
for i in range(newdf.shape[0]):
    if (game_Day[i] - init_Time).days >= 330:
        init_Time = game_Day[i]
        print(init_Time)
        seasonBeginIndex.append(i)
# append a one-past-the-end sentinel so that consecutive differences give
# the number of rows in each season
seasonBeginIndex.append(newdf.shape[0])
DiffseasonBeginIndex = np.roll(seasonBeginIndex, -1) - seasonBeginIndex
DiffseasonBeginIndex = DiffseasonBeginIndex[:-1]
# drop the sentinel again now that the differences are computed
seasonBeginIndex.pop()
print(seasonBeginIndex)
print(DiffseasonBeginIndex)
Now we are in a position to create a serial number for each game. Remember that there are two consecutive rows for the same game.
# Build a serial game number of the form year * 10000 + game-index-in-season.
# Each game occupies two consecutive rows (one per team): the id is written
# on the even row and then copied onto the following odd row.
tmpgameid = np.zeros(newdf.shape[0])
season_number = 0
initial_year = 2012
for start, length in zip(seasonBeginIndex, DiffseasonBeginIndex):
    year_tag = (initial_year + season_number) * 10000
    for offset in range(0, length, 2):
        tmpgameid[start + offset] = year_tag + offset / 2
    season_number += 1
# odd rows are still 0, so adding the rolled array copies each even row's id
# onto the row right after it
tmpgameid = tmpgameid + np.roll(tmpgameid, +1)
idx = 0
newdf.insert(loc=idx, column='gameID', value=tmpgameid.astype(int))
newdf.iloc[2450:2460, 0:12]
newdf.shape
Finally, we split the big file into seasons files.
# One dataframe and one csv file per season, then move the files away.
initial_year = 2012
sea_names = ['season' + str(initial_year + i) + 'df'
             for i in range(len(seasonBeginIndex))]
sea_names_csv = [name + '.csv' for name in sea_names]
for i in range(len(seasonBeginIndex)):
    lo = seasonBeginIndex[i]
    hi = lo + DiffseasonBeginIndex[i]
    # keep the notebook convention of one global dataframe per season
    vars()[sea_names[i]] = pd.DataFrame(newdf.iloc[lo:hi, :])
    vars()[sea_names[i]].to_csv(sea_names_csv[i], index=False)
    move(os.path.join(currdirname, sea_names_csv[i]),
         os.path.join(destdirname, sea_names_csv[i]))
# now load the player-level box scores
plsdf = pd.read_csv('2012-18_playerBoxScore.csv', header=0)
plsdf.shape
Now the primary key comes into action. To each game in the player statistics file, we will add a column with the gameID from the previous dataframe. The basic idea is that we will have a sequence of player entries (10 entries) in the same order as the games appeared in the newdf dataframe.
# Walk both dataframes in parallel: player rows come in the same game order
# as the team rows in newdf, so we advance the newdf cursor (j) until its
# (team, opponent) pair matches the current player row, then tag that player
# row with the corresponding game id.
player_gameID = []
j = 0
i = 0
while i < plsdf.shape[0]:
    same_game = (newdf.teamAbbr[j] == plsdf.teamAbbr[i]
                 and newdf.opptAbbr[j] == plsdf.opptAbbr[i])
    if same_game:
        player_gameID.append(newdf.gameID[j])
        i += 1
    else:
        j += 1
player_gameID = np.array(player_gameID)
# gameID goes in as the first column
idx = 0
plsdf.insert(loc=idx, column='gameID', value=player_gameID.astype(int))
As before, the date was a string. We will transform it into a datetime object. This will allow us to make calculations (timedelta) with dates.
# Parse the players' birth dates into Timestamps.  pd.to_datetime is
# vectorized over the whole Series, so no element-wise map/lambda is needed.
playerBDate = pd.to_datetime(plsdf.playBDate.astype(str))
We simply replace the column of the strings with the column of datetime objects and create a view of it.
# Overwrite the string column with the datetime version.  Assigning to an
# existing column keeps its position, so the drop-then-insert-at-the-same-
# index roundtrip of the first version is unnecessary.
plsdf['playBDate'] = playerBDate
# quick sanity view of the converted column next to a neighbouring one
col_bdate = plsdf.columns.get_loc('playBDate')
col_height = plsdf.columns.get_loc('playHeight')
plsdf.iloc[0:10, [col_bdate, col_height]]
We create at this moment the second Primary Key: the playerID. This will be a sequential number that uniquely identifies a player that has ever played in the NBA in the seasons that we are considering.
# Second primary key: a serial playerID for every distinct player.
# The first version had two defects:
#   * the duplicate test used `~(a and b and c)` — bitwise NOT on a bool —
#     and checked birth date, first name and last name against the WHOLE
#     table independently, instead of requiring a single row matching all
#     three at once;
#   * it grew the dataframe with the deprecated, O(n^2) DataFrame.append.
# Here a player is identified by the (birth date, last name, first name)
# triple, tracked in a set, and the dataframe is built once at the end.
colunas = ['playerID', 'playBDate', 'playLNm', 'playFNm', 'playDispNm']
records = []
seen = set()
listID = []          # one serial number per distinct player, in order found
playerSN = 100000    # serial numbers start at 100000
for i in range(plsdf.shape[0]):
    key = (plsdf.playBDate[i], plsdf.playLNm[i], plsdf.playFNm[i])
    if key not in seen:
        seen.add(key)
        records.append({'playerID': playerSN,
                        'playBDate': plsdf.playBDate[i],
                        'playLNm': plsdf.playLNm[i],
                        'playFNm': plsdf.playFNm[i],
                        'playDispNm': plsdf.playDispNm[i]})
        listID.append(playerSN)
        playerSN += 1
playerID = pd.DataFrame(records, columns=colunas)
plfilename = 'NBAplayersID.csv'
playerID.to_csv(plfilename, index=False)
move(os.path.join(currdirname, plfilename), os.path.join(destdirname, plfilename))
# Season boundaries for the player file, expressed as gameID thresholds:
# season k starts at gameID (initial_year + k) * 10000.
initial_year = 2012
beginSeasPlayIndex = [initial_year * 10000]
j = 1
for i in range(plsdf.shape[0]):
    if plsdf.gameID[i] >= (initial_year + j) * 10000:
        beginSeasPlayIndex.append((initial_year + j) * 10000)
        j += 1
# closing sentinel, derived from the number of seasons actually found
# (the first version hard-coded 20180000)
beginSeasPlayIndex.append((initial_year + j) * 10000)
beginSeasPlayIndex
# Names for the per-season player dataframes and their csv files.
initial_year = 2012
n_seasons = len(beginSeasPlayIndex) - 1
plsea_names = ['plseason' + str(initial_year + k) + 'df'
               for k in range(n_seasons)]
plsea_names_csv = [name + '.csv' for name in plsea_names]
print(plsea_names)
print(plsea_names_csv)
At this moment we are in a position to store each season in a separate csv file and move it to the destination folder.
# Slice the player dataframe season by season using the gameID thresholds,
# save each slice to csv and move it to the destination folder.
for i in range(len(beginSeasPlayIndex) - 1):
    in_season = ((plsdf.gameID >= beginSeasPlayIndex[i])
                 & (plsdf.gameID < beginSeasPlayIndex[i + 1]))
    # keep the notebook convention of one global dataframe per season
    vars()[plsea_names[i]] = pd.DataFrame(plsdf.loc[in_season, :])
    vars()[plsea_names[i]].to_csv(plsea_names_csv[i], index=False)
    move(os.path.join(currdirname, plsea_names_csv[i]),
         os.path.join(destdirname, plsea_names_csv[i]))
# sanity check: the per-season row counts
for i in range(len(beginSeasPlayIndex) - 1):
    print(vars()[plsea_names[i]].shape)
Just a view of the head of the dataframe of the first season that we are considering.
# head of the first (2012-13) player season dataframe: 21 rows, 30 columns
vars()[plsea_names[0]].iloc[0:21,0:30]