There are two main files with data related to the NBA games from season 2012-13 to season 2017-18. The first, \textbf{2012-18_officialBoxScore.csv}, contains the aggregate data for each team at each game. The second, \textbf{2012-18_playerBoxScore.csv}, contains the individual data of each player for each game recorded in the aggregate file. The files were downloaded from the kaggle website https://www.kaggle.com/pablote/nba-enhanced-stats/home, with minor edits. These consisted only of removing the columns of data which can easily be computed from other columns.
import numpy as np
import pandas as pd
import os
from shutil import move
We will split the big files into seasons and move them all to \textbf{destdirname}.
# working directory that holds the raw kaggle csv files
currdirname = '/Users/gmorais/MEGA/DATA/DeepLearning/aulas/week07'
# destination folder that will receive every file we generate
destdirname = '/Users/gmorais/MEGA/DATA/DeepLearning/aulas/week07/files'
os.makedirs(destdirname, exist_ok=True)
# load the aggregate (team-level) box scores; the first file line is the header
tmpdf = pd.read_csv('2012-18_officialBoxScore.csv', header=0)
The original file has three rows for each team for each game, because it contains the names of the officials. We do not need those, so we will remove the repeated rows and delete the officials' columns.
# The raw file repeats each (game, team) row three times, one per official.
# Keep only the first row of every run of identical consecutive teamAbbr.
# NOTE(review): this assumes consecutive games never share the boundary
# team abbreviation — confirm against the data ordering.
rows_to_skip = []
for i in range(1, tmpdf.shape[0]):
    if tmpdf.teamAbbr[i] == tmpdf.teamAbbr[i - 1]:
        # +1 because read_csv's skiprows counts raw 0-indexed file lines,
        # where line 0 is the header: dataframe row i lives on file line i+1.
        # (Without the +1 the last, not the first, row of each run survived;
        # the rows only differ in the officials columns dropped below.)
        rows_to_skip.append(i + 1)
del tmpdf
newdf = pd.read_csv('2012-18_officialBoxScore.csv', header=0, skiprows=rows_to_skip)
print(newdf.shape)
# officials' names and other redundant columns are not needed
colnames_to_drop = ['seasTyp','offLNm','offFNm','teamRslt','opptConf','opptDiv']
newdf = newdf.drop(colnames_to_drop, axis=1)
newdf.iloc[0:10, 0:12]
Next we remove the Conference and Division columns and store them in a separate csv file, \textbf{NBAteams.csv}.
# Collect the 30 distinct teams, in order of first appearance, together with
# their conference and division.  (The first version seeded team_def with a
# dummy element that had to be deleted afterwards, and kept a separate
# counter that just mirrored len(team_names).)
nteams = 30
team_names = []
team_def = []
i = 0
while len(team_names) < nteams:
    abbr = newdf.teamAbbr[i]
    if abbr not in team_names:
        team_names.append(abbr)
        team_def.append([abbr, newdf.teamConf[i], newdf.teamDiv[i]])
    i += 1
# Persist the team definitions in their own small dataframe / csv file.
nba_team_names_columns = ['teamAbbr','teamConf','teamDiv']
nba_team_names = pd.DataFrame(team_def, columns=nba_team_names_columns)
nbateamfilename = 'NBAteams.csv'
nba_team_names.to_csv(nbateamfilename, index=False)
# move() wants full path names: with a bare filename it errors out when the
# file already exists in the destination folder
src_path = os.path.join(currdirname, nbateamfilename)
dst_path = os.path.join(destdirname, nbateamfilename)
move(src_path, dst_path)
nba_team_names.iloc[:, :]
# conference / division now live in NBAteams.csv, so drop them from newdf
newdf = newdf.drop(['teamConf','teamDiv'], axis=1)
newdf.iloc[0:10, 0:12]
We need to record the beginning of each season. We assume that there is a gap bigger than 330 days between the beginning of consecutive seasons.
# Parse the game date strings into Timestamps so that timedelta arithmetic
# works on them.  pd.to_datetime is vectorized over a whole Series, so the
# element-wise map/lambda of the first version is unnecessary.  The unused,
# commented-out game_Time computation was removed.
# NOTE(review): the format assumes dates like 31/12/2012 — confirm against
# the (locally edited) csv file.
game_Day = pd.to_datetime(newdf.gmDate.astype(str), format="%d/%m/%Y")
# Record the row index where each season begins: a gap of at least 330 days
# from the current season's first game marks a new season.
# (The first version kept a separate counter "iterador" that always equalled
# the loop index i — both started at 0 and were incremented exactly once per
# iteration — so the counter is dropped here.)
seasonBeginIndex = [0]
init_Time = game_Day[0]
print(init_Time)
for i in range(newdf.shape[0]):
    if (game_Day[i] - init_Time).days >= 330:
        init_Time = game_Day[i]
        print(init_Time)
        seasonBeginIndex.append(i)
# append a one-past-the-end sentinel so that consecutive differences give
# the number of rows in each season
seasonBeginIndex.append(newdf.shape[0])
DiffseasonBeginIndex = np.roll(seasonBeginIndex, -1) - seasonBeginIndex
DiffseasonBeginIndex = DiffseasonBeginIndex[:-1]
# drop the sentinel again now that the differences are computed
seasonBeginIndex.pop()
print(seasonBeginIndex)
print(DiffseasonBeginIndex)
Now we are in a position to create a serial number for each game. Remember that there are two consecutive rows for the same game.
# Build a serial game number of the form year * 10000 + game-index-in-season.
# Each game occupies two consecutive rows (one per team): the id is written
# on the even row and then copied onto the following odd row.
tmpgameid = np.zeros(newdf.shape[0])
season_number = 0
initial_year = 2012
for start, length in zip(seasonBeginIndex, DiffseasonBeginIndex):
    year_tag = (initial_year + season_number) * 10000
    for offset in range(0, length, 2):
        tmpgameid[start + offset] = year_tag + offset / 2
    season_number += 1
# odd rows are still 0, so adding the rolled array copies each even row's id
# onto the row right after it
tmpgameid = tmpgameid + np.roll(tmpgameid, +1)
idx = 0
newdf.insert(loc=idx, column='gameID', value=tmpgameid.astype(int))
newdf.iloc[2450:2460, 0:12]
newdf.shape
Finally, we split the big file into seasons files.
# One dataframe and one csv file per season, then move the files away.
initial_year = 2012
sea_names = ['season' + str(initial_year + i) + 'df'
             for i in range(len(seasonBeginIndex))]
sea_names_csv = [name + '.csv' for name in sea_names]
for i in range(len(seasonBeginIndex)):
    lo = seasonBeginIndex[i]
    hi = lo + DiffseasonBeginIndex[i]
    # keep the notebook convention of one global dataframe per season
    vars()[sea_names[i]] = pd.DataFrame(newdf.iloc[lo:hi, :])
    vars()[sea_names[i]].to_csv(sea_names_csv[i], index=False)
    move(os.path.join(currdirname, sea_names_csv[i]),
         os.path.join(destdirname, sea_names_csv[i]))
# now load the player-level box scores
plsdf = pd.read_csv('2012-18_playerBoxScore.csv', header=0)
plsdf.shape
Now the primary key comes into action. To each game in the player statistics file, we will add a column with the gameID from the previous dataframe. The basic idea is that we will have a sequence of player entries (10 entries) in the same order as the games appeared in the newdf dataframe.
# Walk both dataframes in parallel: player rows come in the same game order
# as the team rows in newdf, so we advance the newdf cursor (j) until its
# (team, opponent) pair matches the current player row, then tag that player
# row with the corresponding game id.
player_gameID = []
j = 0
i = 0
while i < plsdf.shape[0]:
    same_game = (newdf.teamAbbr[j] == plsdf.teamAbbr[i]
                 and newdf.opptAbbr[j] == plsdf.opptAbbr[i])
    if same_game:
        player_gameID.append(newdf.gameID[j])
        i += 1
    else:
        j += 1
player_gameID = np.array(player_gameID)
# gameID goes in as the first column
idx = 0
plsdf.insert(loc=idx, column='gameID', value=player_gameID.astype(int))
As before, the date was a string. We will transform it into a datetime object. This will allow us to make calculations (timedelta) with dates.
# Parse the players' birth dates into Timestamps.  pd.to_datetime is
# vectorized over the whole Series, so no element-wise map/lambda is needed.
playerBDate = pd.to_datetime(plsdf.playBDate.astype(str))
We simply replace the column of the strings with the column of datetime objects and create a view of it.
# Overwrite the string column with the datetime version.  Assigning to an
# existing column keeps its position, so the drop-then-insert-at-the-same-
# index roundtrip of the first version is unnecessary.
plsdf['playBDate'] = playerBDate
# quick sanity view of the converted column next to a neighbouring one
col_bdate = plsdf.columns.get_loc('playBDate')
col_height = plsdf.columns.get_loc('playHeight')
plsdf.iloc[0:10, [col_bdate, col_height]]
We create at this moment the second Primary Key: the playerID. This will be a sequential number that uniquely identifies a player that has ever played in the NBA in the seasons that we are considering.
# Second primary key: a serial playerID for every distinct player.
# The first version had two defects:
#   * the duplicate test used `~(a and b and c)` — bitwise NOT on a bool —
#     and checked birth date, first name and last name against the WHOLE
#     table independently, instead of requiring a single row matching all
#     three at once;
#   * it grew the dataframe with the deprecated, O(n^2) DataFrame.append.
# Here a player is identified by the (birth date, last name, first name)
# triple, tracked in a set, and the dataframe is built once at the end.
colunas = ['playerID', 'playBDate', 'playLNm', 'playFNm', 'playDispNm']
records = []
seen = set()
listID = []          # one serial number per distinct player, in order found
playerSN = 100000    # serial numbers start at 100000
for i in range(plsdf.shape[0]):
    key = (plsdf.playBDate[i], plsdf.playLNm[i], plsdf.playFNm[i])
    if key not in seen:
        seen.add(key)
        records.append({'playerID': playerSN,
                        'playBDate': plsdf.playBDate[i],
                        'playLNm': plsdf.playLNm[i],
                        'playFNm': plsdf.playFNm[i],
                        'playDispNm': plsdf.playDispNm[i]})
        listID.append(playerSN)
        playerSN += 1
playerID = pd.DataFrame(records, columns=colunas)
plfilename = 'NBAplayersID.csv'
playerID.to_csv(plfilename, index=False)
move(os.path.join(currdirname, plfilename), os.path.join(destdirname, plfilename))
# Season boundaries for the player file, expressed as gameID thresholds:
# season k starts at gameID (initial_year + k) * 10000.
initial_year = 2012
beginSeasPlayIndex = [initial_year * 10000]
j = 1
for i in range(plsdf.shape[0]):
    if plsdf.gameID[i] >= (initial_year + j) * 10000:
        beginSeasPlayIndex.append((initial_year + j) * 10000)
        j += 1
# closing sentinel, derived from the number of seasons actually found
# (the first version hard-coded 20180000)
beginSeasPlayIndex.append((initial_year + j) * 10000)
beginSeasPlayIndex
# Names for the per-season player dataframes and their csv files.
initial_year = 2012
n_seasons = len(beginSeasPlayIndex) - 1
plsea_names = ['plseason' + str(initial_year + k) + 'df'
               for k in range(n_seasons)]
plsea_names_csv = [name + '.csv' for name in plsea_names]
print(plsea_names)
print(plsea_names_csv)
At this moment we are in a position to store each season in a separate csv file and move it to the destination folder.
# Slice the player dataframe season by season using the gameID thresholds,
# save each slice to csv and move it to the destination folder.
for i in range(len(beginSeasPlayIndex) - 1):
    in_season = ((plsdf.gameID >= beginSeasPlayIndex[i])
                 & (plsdf.gameID < beginSeasPlayIndex[i + 1]))
    # keep the notebook convention of one global dataframe per season
    vars()[plsea_names[i]] = pd.DataFrame(plsdf.loc[in_season, :])
    vars()[plsea_names[i]].to_csv(plsea_names_csv[i], index=False)
    move(os.path.join(currdirname, plsea_names_csv[i]),
         os.path.join(destdirname, plsea_names_csv[i]))
# sanity check: the per-season row counts
for i in range(len(beginSeasPlayIndex) - 1):
    print(vars()[plsea_names[i]].shape)
Just a view of the head of the dataframe of the first season that we are considering.
# head of the first (2012-13) player season dataframe: 21 rows, 30 columns
vars()[plsea_names[0]].iloc[0:21,0:30]