Unsupervised Clustering Analysis of NBA Players

Overview

This technical report was created for our Data Mining and Wrangling class in the AIM MSDS program, as one of the required lab reports during our second semester. In this report, we sought to understand how the landscape of the NBA has changed over the decades, and specifically whether we can generalize certain player stereotypes throughout the years. We analyze these stereotypes, as well as the changes among them, using unsupervised clustering, and apply Principal Component Analysis to extract meaningful features from the data. At the end, we also take a look at the evolution of 3-point shooters and the dramatic change that the 3-point shot has introduced to NBA gameplay (mostly as part of my personal interest).

Acknowledgements

This analysis was done together with my Lab partner, Lance Aven Sy.

Imports and Functions

import re
import sqlite3
from collections import Counter
from warnings import simplefilter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns

from scipy.spatial.distance import euclidean
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import calinski_harabasz_score, silhouette_score
from sklearn.metrics import adjusted_mutual_info_score, adjusted_rand_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from wordcloud import WordCloud

plt.style.use('https://gist.githubusercontent.com/lpsy/e81ff2c0decddc9c6df'
              'eb2fcffe37708/raw/lsy_personal.mplstyle')

simplefilter('ignore')
def cluster_range(X, clusterer, k_stop, actual=None):
    """Return a dictionary of cluster labels, internal validation values
    and, if actual labels are given, external validation values for every k
    starting from k = 2

    Parameters
    ----------
    X : array
        Design matrix with each row corresponding to a point
        Does not accept sparse matrices
    clusterer : sklearn.cluster estimator
        Clustering object with an `n_clusters` attribute, e.g., KMeans
    k_stop : integer
        ending number of clusters
    actual : array, optional
        cluster labels

    Returns
    -------
    out : dict
    """
    out = {'chs': [], 'iidrs': [], 'inertias': [], 'scs': [], 'ys': []}
    if isinstance(actual, np.ndarray):
        out['amis'] = []
        out['ars'] = []
        out['ps'] = []

    for k in range(2, k_stop+1):
        clusterer.n_clusters = k

        y = clusterer.fit_predict(X)
        out['ys'].append(y)

        # Calinski-Harabasz index
        out['chs'].append(calinski_harabasz_score(X, y))

        # Intra/Inter cluster distance ratio
        out['iidrs'].append(intra_to_inter(X, y, euclidean, 50))

        # inertias
        out['inertias'].append(clusterer.inertia_)

        # Silhouette score
        out['scs'].append(silhouette_score(X, y))

        if isinstance(actual, np.ndarray):
            # Adjusted mutual information
            out['amis'].append(adjusted_mutual_info_score(
                actual, y, average_method='arithmetic'))

            # Adjusted Rand Index
            out['ars'].append(adjusted_rand_score(actual, y))

            # Cluster purity
            out['ps'].append(purity(actual, y))
    return out
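
Note that cluster_range calls a purity helper that is not defined elsewhere in this report; below is a minimal sketch of cluster purity (the fraction of points falling in the majority actual class of their assigned cluster), assuming array-like integer labels:

def purity(actual, pred):
    """Fraction of points belonging to the majority actual label
    of their predicted cluster."""
    actual = np.asarray(actual)
    pred = np.asarray(pred)
    total = 0
    for k in np.unique(pred):
        # majority count of actual labels within this predicted cluster
        _, counts = np.unique(actual[pred == k], return_counts=True)
        total += counts.max()
    return total / len(actual)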
def plot_clusters(tsne_df, ys):
    n = len(ys)
    rows = int(round(np.sqrt(n)))
    cols = int(round(n/rows))

    if cols > rows:
        cols, rows = rows, cols

    # width scales with the number of columns, height with the number of rows
    fig, axes = plt.subplots(rows, cols, dpi=150,
                             figsize=(cols*5 + 1, rows*4 + 1))
    
    for i, ax in enumerate(fig.axes):
        if i >= n:
            fig.delaxes(ax)
            continue
        ax.scatter(x='x', y='y', c=ys[i], alpha=0.8, data=tsne_df)
        ax.set_title(f'{i+2} clusters')
    
    return fig
def intra_to_inter(X, y, dist, r):
    """Compute intracluster to intercluster distance ratio
    
    Parameters
    ----------
    X : array
        Design matrix with each row corresponding to a point
    y : array
        Class label of each point
    dist : callable
        Distance between two points. It should accept two arrays, each 
        corresponding to the coordinates of each point
    r : integer
        Number of pairs to sample
        
    Returns
    -------
    ratio : float
        Intracluster to intercluster distance ratio
    """
    p = []
    q = []
    np.random.seed(11)
    for i, j in np.random.randint(low=0, high=len(y), size=(r, 2)): 
        if i == j:
            continue
        elif (y[i] == y[j]):
            p.append(dist(X[i],X[j]))
        else:
            q.append(dist(X[i],X[j]))
    return (np.asarray(p).mean())/(np.asarray(q).mean())
def plot_internal(inertias, chs, iidrs, scs):
    """Plot internal validation values"""
    
    colors = plt.rcParams["axes.prop_cycle"].by_key()["color"]
    fig, ax = plt.subplots(2,2, figsize=(16,9))
    
    ks = np.arange(2, len(inertias)+2)
    ax[0][0].plot(ks, inertias, '-o', label='SSE', c=colors[0])
    ax[0][0].set_xlabel('$k$')
    ax[0][0].set_ylabel('SSE')
    ax[0][0].set_title('SSE')

    ax[1][0].plot(ks, chs, '-o', label='CH', c=colors[1])
    ax[1][0].set_xlabel('$k$')
    ax[1][0].set_ylabel('CH')
    ax[1][0].set_title('CH')
    
    ax[0][1].plot(ks, iidrs, '-o', label='Inter-intra', c=colors[2])
    ax[0][1].set_xlabel('$k$')
    ax[0][1].set_ylabel('Inter-Intra') 
    ax[0][1].set_title('Inter-Intra')
    
    ax[1][1].plot(ks, scs, '-o', label='Silhouette coefficient', c=colors[3])
    ax[1][1].set_xlabel('$k$')
    ax[1][1].set_ylabel('Silhouette') 
    ax[1][1].set_title('Silhouette')
    
    for axs in fig.get_axes():
        for axis in [axs.xaxis, axs.yaxis]:
            axis.set_major_locator(ticker.MaxNLocator(integer=True))   
    
    plt.tight_layout()
    return fig

Exploratory Data Analysis (EDA)

# connect to the sqlite db and load season averages
conn = sqlite3.connect('Lab_Lab 5_nbaDB.db')
eda = pd.read_sql('''SELECT * FROM season_average''', conn)

# drop all empty or None rows
eda.dropna(how='any', inplace=True)

# remove all non-numeric data
eda = eda[~eda['G'].str.contains('Did')]

# convert all numeric columns to float
eda[eda.columns[6:-1]] = eda[eda.columns[6:-1]].astype(float)

# drop index columns
eda.drop('index', axis=1, inplace=True)

One of the most interesting developments in the NBA’s recent history is the growing prevalence of 3 point shots. This was first popularized by the Steve Nash-led Phoenix Suns of 2004-2006, with their run-and-gun style offense under coach Mike D’Antoni. At the time, however, this was seen as more of a fad: the Suns never made it past the Western Conference Finals and so never gained mainstream success. In today’s NBA game, teams have utilized the 3 point shot to great effect; this is seen most clearly in the Golden State Warriors, who have won 3 of the last 5 championships behind their “Splash Brothers”, Klay Thompson and 2-time MVP Steph Curry, and in the Houston Rockets, who hold the record for the most 3 point attempts per game by a team in NBA history. With this in mind, let’s take a look at the past 40 years of 3 point attempts and 3 point shooting percentage to see the growth in both volume and accuracy of the 3 point shot.

seasons = eda.groupby('Season')[['3PA', 'PTS', '3P%']].mean().reset_index()

fig, ax = plt.subplots(figsize=(16,8), dpi=200)
ax.plot(seasons['Season'], seasons['3P%'], color='k', label='3 Point Percentage')
ax2 = ax.twinx()
ax2.plot(seasons['Season'], seasons['3PA'], color='green', label='3 Point Attempts')
ax.tick_params('x', labelrotation=75)
ax.legend()
ax2.legend(loc='upper left')
ax.axvline(0, color='green', ls='--', alpha=0.5)
ax.axvline(14, color='red', ls='--', alpha=0.5)
ax.axvline(17, color='red', ls='--', alpha=0.5)
ax.axvline(35, color='blue', ls='--', alpha=0.5)
ax.set_title('3 Point Attempts and Percentage 1979-2019');

[Figure: 3 Point Attempts and Percentage, 1979-2019]

In the plot above, we can see that when the 3 point line was introduced in the 1979-80 season, 3 point accuracy was very high, but on a very small sample of attempts. The red dotted lines at the 1994-95 and 1996-97 seasons mark the 3-year period during which the NBA shortened the 3 point line in order to increase the volume and usage of the 3 pointer. Lastly, we can see the rapid increase in both attempts and accuracy of the 3 pointer from the 2014-15 season onwards. This is marked by the blue dotted line, which indicates the year Steph Curry won his first MVP award and the Golden State Warriors dominated the NBA to win their first championship in 40 years. This was a turning point for the 3 point shot: most teams in the current NBA cannot survive without a good 3 point shooter, and this is reflected in the marked increase in both volume and accuracy of 3 point shooting in the league since then.

Load Data

We proceed with clustering the NBA players.

# connect to sqlite db
conn = sqlite3.connect('Lab_Lab 5_nbaDB.db')
df = pd.read_sql('SELECT * FROM season_average', conn, index_col='index')
df.head()
index  Season   Age   Tm   Lg   Pos  G     GS    MP    FG   FGA  ...  ORB  DRB  TRB  AST  STL  BLK  TOV  PF   PTS  Player
0      1990-91  22.0  POR  NBA  PF   43.0  0.0   6.7   1.3  2.7  ...  0.6  1.4  2.1  0.3  0.1  0.3  0.5  0.9  3.1  Alaa Abdelnaby
1      1991-92  23.0  POR  NBA  PF   71.0  1.0   13.2  2.5  5.1  ...  1.1  2.5  3.7  0.4  0.4  0.2  0.9  1.9  6.1  Alaa Abdelnaby
2      1992-93  24.0  TOT  NBA  PF   75.0  52.0  17.5  3.3  6.3  ...  1.7  2.8  4.5  0.4  0.3  0.3  1.3  2.5  7.7  Alaa Abdelnaby
3      1992-93  24.0  MIL  NBA  PF   12.0  0.0   13.3  2.2  4.7  ...  1.0  2.1  3.1  0.8  0.5  0.3  1.1  2.0  5.3  Alaa Abdelnaby
4      1992-93  24.0  BOS  NBA  PF   63.0  52.0  18.3  3.5  6.6  ...  1.8  3.0  4.8  0.3  0.3  0.3  1.3  2.6  8.2  Alaa Abdelnaby

5 rows × 31 columns

As the seasons in the collected data are written in the NBA format of yyyy-yy, we create a function named get_year to build a new column containing the ending year of each season, i.e., 2008-09 becomes 2009.

def get_year(x):
    # grab the trailing digits of the season string, e.g. '2008-09' -> '09'
    year = int(re.search(r'\d+$', x).group(0))

    # two-digit years below 20 belong to the 2000s; the rest to the 1900s
    if year < 20:
        year += 2000
    else:
        year += 1900

    return year
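
A couple of quick examples of what get_year returns:

get_year('2008-09')  # 2009
get_year('1995-96')  # 1996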

As we are interested in seeing the progression of NBA players over the last 10 years (2009-2019), we filter our dataframe to exclude all years before the 2009 season. The resulting dataframe contains null values and non-numeric values, i.e., “Did not play”, which we remove by dropping these rows. Lastly, we slice our final dataframe to keep only columns that are not highly correlated with one another (based on a correlation heatmap from our EDA, not shown in this report).

df['year'] = df.Season.apply(get_year)
df2 = df[df.year>=2009].drop('Season', axis=1)
df2.shape
(7092, 31)
df3 = df2.drop_duplicates(subset=['Player', 'year'])
df3 = df3.sort_values(['Player', 'year']).reset_index(drop=True)
print(df3.shape)
df3.head(2)
(5550, 31)
   Age   Tm   Lg   Pos  G     GS   MP    FG   FGA  FG%    ...  DRB  TRB  AST  STL  BLK  TOV  PF   PTS  Player        year
0  24.0  DAL  NBA  C    22    0    7.4   0.8  1.9  0.405  ...  1.3  1.6  0.2  0.0  0.6  0.5  1.0  2.2  A.J. Hammons  2017
1  23.0  IND  NBA  PG   56.0  2.0  15.4  2.6  6.3  0.41   ...  1.4  1.6  1.9  0.6  0.1  1.1  0.9  7.3  A.J. Price    2010

2 rows × 31 columns

df3.columns
Index(['Age', 'Tm', 'Lg', 'Pos', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Player',
       'year'],
      dtype='object')
df4 = df3[df3['G'].apply(lambda x: x.replace('.', '').isnumeric())]
df4 = df4[['Player', 'year', 'G', '3P', '3P%', '2P', '2P%', 'FT%',
                'eFG%', 'ORB', 'DRB', 'AST', 'STL', 'BLK', 'PF']]

df4.dropna(inplace=True)
print(df4.shape)
df4.head(2)
(4527, 15)
   Player        year  G     3P   3P%    2P   2P%    FT%   eFG%   ORB  DRB  AST  STL  BLK  PF
0  A.J. Hammons  2017  22    0.2  0.5    0.5  0.375  0.45  0.464  0.4  1.3  0.2  0.0  0.6  1.0
1  A.J. Price    2010  56.0  1.1  0.345  1.5  0.472  0.8   0.494  0.2  1.4  1.9  0.6  0.1  0.9
df4['Player_Unique'] = df4.Player + '_' + df4.year.astype(str)
df4.loc[:, 'G':'PF'] = df4.loc[:,'G':'PF'].astype(np.float64)
print(df4.shape)
df4.head(2)
(4527, 16)
   Player        year  G     3P   3P%    2P   2P%    FT%   eFG%   ORB  DRB  AST  STL  BLK  PF   Player_Unique
0  A.J. Hammons  2017  22.0  0.2  0.500  0.5  0.375  0.45  0.464  0.4  1.3  0.2  0.0  0.6  1.0  A.J. Hammons_2017
1  A.J. Price    2010  56.0  1.1  0.345  1.5  0.472  0.80  0.494  0.2  1.4  1.9  0.6  0.1  0.9  A.J. Price_2010

In the final dataframe, we create a new column that concatenates the player name with the season played, since each player can appear in multiple years of the dataset. We are left with the following columns (all counting stats are per-game averages):

  • G - Number of games played
  • 3P - Number of 3 point shots made
  • 3P% - Percentage of 3 point shots made
  • 2P - Number of 2 point shots made
  • 2P% - Percentage of 2 point shots made
  • FT% - Percentage of free throws made
  • eFG% - Effective field goal percentage, (FGM + 0.5 * 3PM) / FGA (see the snippet after this list)
  • ORB - Number of Offensive Rebounds
  • DRB - Number of Defensive Rebounds
  • AST - Number of assists
  • STL - Number of steals
  • BLK - Number of blocks
  • PF - Number of personal fouls

Total number of features: 13.
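
Since eFG% is the only derived stat in the list, here is its formula expressed in code (the helper name efg is ours, purely for illustration):

def efg(fgm, three_pm, fga):
    # a made 3 pointer counts as 1.5 made field goals
    return (fgm + 0.5 * three_pm) / fga

efg(fgm=4, three_pm=2, fga=10)  # 0.5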

Dimensionality Reduction (PCA)

As we are still left with 13 features, we perform dimensionality reduction using Principal Component Analysis. PCA projects the dataset onto the directions of greatest explained variance, so each principal component represents not a single feature but a weighted combination of all the features. In doing so, we reduce the number of “features” used in the analysis, and with it the amount of noise in the model.

However, before proceeding to Principal Component Analysis, we must first ensure that the dataset is mean-centered at 0, as this is one of the prerequisites of PCA. We achieve this with the StandardScaler class from sklearn.preprocessing, which standardizes each feature to zero mean and unit variance, i.e., each value x becomes (x - mean) / std.

X = df4.iloc[:,2:-1]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
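
As a quick sanity check, every column of the scaled matrix should now have (approximately) zero mean and unit variance:

# each standardized feature should have mean ~0 and std ~1
print(X_scaled.mean(axis=0).round(6))
print(X_scaled.std(axis=0).round(6))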

Once we have scaled the dataset, we take a look at the directions of the vectors that represent each feature in our data. This gives us a better idea of the directionality, or relationship, of each feature with the others. In the plot below, we show the direction of each feature vector (red arrow) over the data projected on the first 2 principal components.

V = np.cov(X_scaled, rowvar=False)
lambdas, w = np.linalg.eig(V)
indices = np.argsort(lambdas)[::-1]
lambdas = lambdas[indices]
w = w[:, indices]
new_X = np.dot(X_scaled, w)
fig, ax = plt.subplots(dpi=120)
ax.scatter(new_X[:,0], new_X[:,1])
for feat, feat_name in zip(w, df4.columns[2:-1]):
    ax.arrow(0, 0, 7*feat[0], 7*feat[1], color='r', width=0.1, ec='none')
    ax.text(7*feat[0], 7*feat[1], feat_name, ha='center', color='k')
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_title('Plot of Eigenvectors against first 2 PCs');

[Figure: Plot of eigenvectors against the first 2 PCs]

In the plot of the feature vectors above, we can see that there are relationships among the feature vectors that are of interest in the context of an NBA game:

  • The 3 point shooting vectors (3P and 3P%) point almost opposite to the vectors for offensive rebounds and blocks (ORB and BLK). In the context of an NBA game this makes sense: a player who shoots three pointers is usually positioned outside the 3 point line, putting him at a disadvantage for offensive rebounds due to his distance from the basket. This accounts for the negative relationship between these two groups of vectors.
  • In the same vein, 2 point shooting is correlated with defensive rebounds, personal fouls, and effective field goal percentage (DRB, PF, and eFG%, respectively). This is most likely due to “inside play” during NBA games, and is seen most among centers and power forwards, positions typically played near the paint. This proximity to the rim accounts for their high effective field goal percentage, their propensity for defensive rebounds, and the prevalence of their 2 point shots. As these players are also the ones most often inside the paint, they correlate with personal fouls as well: they are tasked with defending the rim, putting them straight in the line of fire of driving opponents and increasing their probability of committing a foul.
  • An interesting observation is that, in terms of the traditional positions of the NBA, we can see a separation along the eFG% vector: the vectors above it (2P, PF, DRB, 2P%, ORB, and BLK) are traditionally attributes of centers and forwards, the “inside” players, whereas the vectors opposite this group (G, STL, AST, FT%, 3P, 3P%) are traditionally attributed to guards, the “outside” players and ball handlers of the game.

A main advantage of PCA is that it limits the number of features needed to describe the dataset. To achieve this, we set a target percentage of explained variance to retain and keep the lowest number of principal components that reaches it. The functions below transform our scaled numpy array into its principal components and discard the components beyond our target explained variance of 80%. This leaves us with 5 principal components, as can be seen in the cumulative variance explained plot below.

def get_min_pcs(X, var):
    """Plot the per-PC and cumulative explained variance, and return the
    rotated data and the minimum number of PCs explaining `var`."""
    colors = plt.rcParams["axes.prop_cycle"].by_key()["color"]
    pca = PCA(svd_solver='full')
    new_X2 = pca.fit_transform(X)

    var_explained = pca.explained_variance_ratio_

    fig, ax = plt.subplots(1, 2, figsize=(16, 6))
    ax[0].plot(np.arange(1, len(var_explained)+1), var_explained, c=colors[0])
    ax[0].set_xlabel('PC')
    ax[0].set_ylabel('variance explained')

    cum_var_explained = var_explained.cumsum()
    ax[1].plot(np.arange(1, len(cum_var_explained)+1),
               cum_var_explained, '-o', c=colors[1])
    ax[1].set_ylim(bottom=0)
    ax[1].set_xlabel('PC')
    ax[1].set_ylabel('cumulative variance explained');

    # index of the first PC whose cumulative variance reaches `var`
    return new_X2, np.searchsorted(cum_var_explained, var) + 1

def project(X_rotated, min_pcs):
    # keep only the first `min_pcs` principal components
    pca = PCA(n_components=min_pcs, svd_solver='full')
    X_new = pca.fit_transform(X_rotated)
    return X_new
X_rotated, min_pcs = get_min_pcs(X_scaled, 0.8)
X_new = project(X_rotated, min_pcs)

[Figure: Variance explained and cumulative variance explained per PC]

Clustering Model (K-Means)

Once we have reduced the dimensionality of the data, we can proceed with the clustering. The method chosen for this data is KMeans clustering. KMeans starts from k randomly initialized “mean points” (centroids), assigns each point to its closest centroid, then recomputes each centroid as the mean of its assigned points. The algorithm iterates through these two steps until the centroids no longer change.
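
A minimal sketch of this loop in numpy (purely illustrative; in this report we rely on sklearn's KMeans):

def kmeans_sketch(X, k, n_iter=100, seed=1337):
    rng = np.random.default_rng(seed)
    # initialize the centroids as k randomly chosen data points
    centroids = X[rng.choice(len(X), size=k, replace=False)]
    for _ in range(n_iter):
        # assignment step: label each point with its nearest centroid
        dists = np.linalg.norm(X[:, None, :] - centroids[None, :, :], axis=2)
        labels = dists.argmin(axis=1)
        # update step: move each centroid to the mean of its assigned points
        # (keep the old centroid if a cluster ends up empty)
        new_centroids = np.array([X[labels == j].mean(axis=0)
                                  if np.any(labels == j) else centroids[j]
                                  for j in range(k)])
        if np.allclose(new_centroids, centroids):  # converged
            break
        centroids = new_centroids
    return labels, centroids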

A prerequisite of the KMeans algorithm is that we must provide the number of clusters ourselves. In order to select the optimal number, we run KMeans for k = 2 up to k = 16 and plot the various internal validation criteria:

  • SSE (Sum of Squared Errors)
  • Intra-to-inter cluster distance ratio
  • Calinski-Harabasz index
  • Silhouette coefficient
kmeans_nba = KMeans(random_state=1337)
out = cluster_range(X_new, kmeans_nba, 16, actual=None)
plot_internal(out['inertias'], out['chs'], out['iidrs'], out['scs']);

[Figure: Internal validation criteria for k = 2 to 16]

The chosen number of clusters is k = 4. This value sits at the elbow of both the SSE and intra-to-inter distance ratio curves, while retaining a high CH score and silhouette coefficient. We proceed with clustering the data using KMeans with 4 clusters as its hyperparameter.
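
As a rough cross-check of the eyeballed elbow (an illustrative heuristic, not how we picked k), the sharpest bend of the SSE curve can be approximated with a second difference:

ks = np.arange(2, len(out['inertias']) + 2)
inertias = np.asarray(out['inertias'])
# the largest second difference marks the sharpest bend in the SSE curve
elbow_k = ks[np.argmax(np.diff(inertias, 2)) + 1]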

# number of clusters
clusters = 4
y_predicted = out['ys'][clusters-2]

df4['y_predicted'] = y_predicted
fig, ax = plt.subplots()
ax.bar(Counter(y_predicted).keys(), Counter(y_predicted).values())
ax.set_ylabel('Number of Players')
ax.set_xlabel('Clusters')
ax.set_title('Number of Players per Cluster (k=4)');

[Figure: Number of Players per Cluster (k=4)]

X_players_new = TSNE(n_components=2,random_state=1337).fit_transform(X_new)
fig, ax = plt.subplots()
ax.scatter(X_players_new[:,0], X_players_new[:,1], c=list(y_predicted), 
           alpha=0.5)
ax.set_title('TSNE Projection of Clusters');

[Figure: t-SNE projection of clusters]

player_clusters = []
for i in range(clusters):
    grp = df4[df4.y_predicted==i]['Player'].to_list()
    player_clusters.append(dict(Counter(grp).most_common()))

Analysis

Once we have clustered our player data, we first take a look at the player composition of each cluster. To get an overview of the players per cluster, we plot the names of the players in a word cloud.

fig, axs = plt.subplots(2,2, figsize=(16,9))
for i, ax in enumerate(fig.axes):
    freq = player_clusters[i]
    wordcloud_obj = WordCloud(background_color="white",
                              mask=None,
                              contour_width=1,
                              contour_color='white',
                              random_state=2018)
    wordcloud = wordcloud_obj.generate_from_frequencies(frequencies=freq)

    # Display the generated image
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis("off")

[Figure: Word clouds of player names per cluster]

From the cursory view given by the word clouds, we can immediately make out two distinct clusters: star players and centers. These are the clusters on the right-hand side, as indicated by the star cluster of Chris Paul, Dwyane Wade, Kevin Durant, and LeBron James, and the cluster of centers/bigs with LaMarcus Aldridge, Brook Lopez, Dwight Howard, Pau Gasol, and others. The other two clusters appear to be a mix of guards and forwards, perhaps separated by skill level: the cluster with Jared Dudley, J.J. Barea, C.J. Miles, and others is made up of known bench or 6th-man players for different teams. To get a clearer picture of each cluster, we can look at the average stats per player.

df5 = df4.merge(df3[['Player', 'year', 'Age', 'Pos', 
                    'GS', 'MP', 'FG', 'TOV', 'PTS']], on=['Player', 'year'])
df5[df5.columns[-5:]] = df5[df5.columns[-5:]].astype(float)
df5.groupby('y_predicted')[df5.columns[2:]].mean().transpose()
y_predicted          0          1          2          3
G            29.056604  69.080184  57.284817  67.415716
3P            0.268257   1.469401   0.831792   0.185044
3P%           0.216518   0.355161   0.348482   0.163632
2P            0.947503   4.079816   1.734418   4.375792
2P%           0.404438   0.480169   0.484890   0.527292
FT%           0.670225   0.800956   0.767463   0.693532
eFG%          0.402485   0.502191   0.507717   0.522745
ORB           0.419645   0.808756   0.573916   2.180608
DRB           1.235738   3.544700   2.069121   4.950824
AST           0.950166   4.185899   1.461016   1.582890
STL           0.351387   1.171060   0.551142   0.730545
BLK           0.158713   0.373548   0.258048   1.012928
PF            1.083685   2.201290   1.600514   2.556907
Age          25.726970  26.663594  26.769406  26.250951
GS            2.593785  55.860829  14.947489  45.475285
MP           11.117203  31.570968  18.800400  26.051838
FG            1.214650   5.545438   2.567009   4.561090
TOV           0.619867   2.091705   0.899144   1.483777
PTS           3.258713  15.398894   6.966781  11.452091
positions = []
for i in set(y_predicted):
    positions.append(dict(Counter(df5[df5['y_predicted'] == i]['Pos'])))
pd.DataFrame.from_records(positions).fillna(0)[['C', 'PF', 'SF', 'SG', 'PG']]
     C   PF   SF   SG   PG
0   78  151  189  235  226
1   17  111  224  305  411
2  120  302  449  517  330
3  425  303   44    7    3

By looking at the average stats per cluster, we can draw different insights about each one. As noted above, there is a clear star cluster and a clear centers cluster. Based on these stats, we can describe each cluster as follows:

  • Cluster 0 - bench players. These are players with low overall stats; the most telling is the number of games played, roughly half that of the other clusters. This also shows in the number of two pointers and three pointers they make, as well as their comparatively low numbers across the other measures. This cluster may also contain the rookies and new players in the league, as it has the lowest average age of all clusters.
  • Cluster 1 - star players. These players log the most minutes, indicating that they are the go-to guys of their teams. This cluster also has the highest average stats in almost all categories, with highlights being games started, minutes played, points, assists, and turnovers. The turnover rate may be related to the number of minutes played, and to the fact that star players usually have the ball in their hands the most while facing the stiffest defenses. The average age of about 26.66 years is also telling, as the accepted “prime years” of an NBA player coincide with this 26-30 range. The cluster is predominantly made up of small forwards, shooting guards, and point guards, the highest volume shooters in the league, which accounts for the cluster’s high scoring rate and shooting accuracy.
  • Cluster 2 - role players. While their numbers are better than those of cluster 0, there is a marked difference between them and the “elite” players in the league; their numbers are generally lower than those of clusters 1 and 3. Going back to the word cloud, the names here are mostly bench players and starters in supporting roles; generally, these are the players you put around your star players.
  • Cluster 3 - big men. This cluster is predominantly composed of centers and power forwards, as evidenced by the position counts. As every team needs to start a center, this role is usually well defined in the stats: a high number of rebounds and blocks, a high number of minutes played, and the most personal fouls, as these players are tasked with defending the rim from driving opponents, making them more prone to fouling. As we saw in the feature vectors of the principal components above, this is a very well defined cluster: the feature vectors relating to big men all point in the same direction, and as the PCA plot suggests, the cluster is defined by rebounds, personal fouls, 2 pointers, and 2 point accuracy.

Based on the clusters that were formed, the unsupervised KMeans algorithm seems to have grouped the players by both skill set and skill level. Whereas we have truly elite guards/forwards in cluster 1, we also have the newbies and bench players in cluster 0. We also found a concentration of big men in cluster 3, possibly because of the strength of the feature vectors along this principal component. Thus, we deem that the clustering algorithm was able to cluster players based on both their skill set (usually related to their position and what they bring to the game) and the level at which they execute that skill set (given by the average stats of each cluster).

One insight we can draw from this analysis is that some players may be found in multiple clusters over the course of their careers. One such trajectory is starting out as a role or bench player and eventually growing into a star role during one’s prime years; another is coming down from one’s peak and being relegated to a bench or supporting role. In our clustering, the first scenario corresponds to a move from cluster 2 into cluster 1, and the second to a move from cluster 1 to cluster 2, or possibly cluster 0. The past 10 years offer several players who fit the first category: Klay Thompson was drafted in 2011 and played a minor role for the Golden State Warriors until their breakthrough 2014-2015 season, in which he became a star player; Draymond Green was similarly drafted by GSW in 2012 and played a minor role up until the same season; and Kawhi Leonard was drafted in 2011 and played a minor role until the San Antonio Spurs won the 2014 championship, for which he won the Finals MVP award. We expect these players’ career trajectories to place them in multiple clusters in our analysis.

players0 = set(df4[df4['y_predicted']==0]['Player'].to_list())
players1 = set(df4[df4['y_predicted']==1]['Player'].to_list())
players2 = set(df4[df4['y_predicted']==2]['Player'].to_list())
players3 = set(df4[df4['y_predicted']==3]['Player'].to_list())

player_analysis = ['Klay Thompson', 'Draymond Green', 'Kawhi Leonard']
compiler = []
for j in [players0, players1, players2, players3]:
    player_dict = dict(zip(player_analysis, [""] * 3))
    for i in player_analysis:
        player_dict[i] = i in j
    compiler.append(player_dict)
pd.DataFrame.from_records(compiler, columns=player_analysis)
   Klay Thompson  Draymond Green  Kawhi Leonard
0  False          True            False
1  True           True            True
2  True           False           True
3  False          False           False

From the table above, we can see that all three players passed through different clusters over their careers. Kawhi Leonard and Klay Thompson, both first round draft picks (15th and 11th, respectively), started out in cluster 2 as role players during the first half of their careers, whereas Draymond Green, a second round pick, started in cluster 0 as a bench player. In their later years, all three developed into full-fledged star players and moved into cluster 1. We validate this by looking at the years in which each player was assigned to clusters 0, 1, and 2.

for i in ['Klay Thompson', 'Kawhi Leonard']:
    print(f'{i}:')
    print(f'Cluster 2: {df4[(df4["y_predicted"]==2) & (df4["Player"]==i)]["Player_Unique"].to_list()}')
    print(f'Cluster 1: {df4[(df4["y_predicted"]==1) & (df4["Player"]==i)]["Player_Unique"].to_list()}')
    print('\n')

d = 'Draymond Green'
print(f'{d}:')
print(f'Cluster 0: {df4[(df4["y_predicted"]==0) & (df4["Player"]==d)]["Player_Unique"].to_list()}')
print(f'Cluster 1: {df4[(df4["y_predicted"]==1) & (df4["Player"]==d)]["Player_Unique"].to_list()}')
print('\n')
Klay Thompson:
Cluster 2: ['Klay Thompson_2012']
Cluster 1: ['Klay Thompson_2013', 'Klay Thompson_2014', 'Klay Thompson_2015', 'Klay Thompson_2016', 'Klay Thompson_2017', 'Klay Thompson_2018', 'Klay Thompson_2019']


Kawhi Leonard:
Cluster 2: ['Kawhi Leonard_2012']
Cluster 1: ['Kawhi Leonard_2013', 'Kawhi Leonard_2014', 'Kawhi Leonard_2015', 'Kawhi Leonard_2016', 'Kawhi Leonard_2017', 'Kawhi Leonard_2018', 'Kawhi Leonard_2019']


Draymond Green:
Cluster 0: ['Draymond Green_2013']
Cluster 1: ['Draymond Green_2014', 'Draymond Green_2015', 'Draymond Green_2016', 'Draymond Green_2017', 'Draymond Green_2018', 'Draymond Green_2019']

From the lists above, we can see that the career trajectories of Klay Thompson and Kawhi Leonard mirror each other down to the year they entered the league: both spent their rookie season as role players before transitioning to elite status in their second year. Similarly, Draymond Green spent his first year as a bench player before breaking out as a star in his second year.

2019 NBA Players

After clustering the NBA players for the 10 year period between 2009-2019, we take a look at the most recent NBA season and cluster the players for this year to validate whether or not these clusters have changed.

df2019 = df[df['year'] == 2019].copy()  # copy to avoid SettingWithCopyWarning
df2019.drop_duplicates(['Player', 'year'], keep='first', inplace=True)
df2019 = df2019.reset_index()
df2019_players = df2019['Player']
df2019 = df2019[['Player', 'year', 'G', '3P', '3P%', '2P', '2P%', 'FT%',
                'eFG%', 'ORB', 'DRB', 'AST', 'STL', 'BLK', 'PF']]
df2019 = df2019[~df2019['G'].str.contains('Did')]
df2019.dropna(how='any', inplace=True)
df2019[df2019['Player'].str.contains('Ray')]
     Player          year  G     3P   3P%    2P   2P%    FT%    eFG%   ORB  DRB  AST  STL  BLK  PF
164  Raymond Felton  2019  33.0  0.6  0.328  1.1  0.473  0.923  0.481  0.1  0.9  1.6  0.3  0.2  0.9
458  Ray Spalding    2019  14.0  0.0  0.000  1.8  0.568  0.333  0.532  1.1  2.4  0.4  0.6  0.6  1.6

PCA 2019

X2019_scaled = scaler.fit_transform(df2019[df2019.columns[2:]])
X2019_rotated, min_pcs2019 = get_min_pcs(X2019_scaled, 0.8)
X2019_new = project(X2019_rotated, min_pcs2019)

[Figure: Variance explained and cumulative variance explained per PC, 2019]

In clustering the 2019 stats, we again perform dimensionality reduction through Principal Component Analysis to get the number of PCs that explain 80% of the total variance. The result mirrors the aggregated 10 year clustering: we again use 5 principal components.

KMeans 2019

kmeans_nba2019 = KMeans(random_state=1337)
out = cluster_range(X2019_new, kmeans_nba2019, 16, actual=None)
plot_internal(out['inertias'], out['chs'], out['iidrs'], out['scs']);

[Figure: Internal validation criteria for k = 2 to 16, 2019]

# number of clusters
clusters = 4
y_predicted = out['ys'][clusters-2]

df2019['y_predicted'] = y_predicted
len_clusters = []
for n in set(y_predicted):
    c = df2019.loc[y_predicted==n]
    cluster_count = len(c)
    len_clusters.append(cluster_count)
len_clusters
[146, 210, 56, 63]
player_clusters = []
for i in range(clusters):
    grp = df2019[df2019.y_predicted==i]['Player'].to_list()
    player_clusters.append(dict(Counter(grp).most_common()))

Analysis 2019

fig, axs = plt.subplots(2,2, figsize=(16,9), dpi=300)
for i, ax in enumerate(fig.axes):
    freq = player_clusters[i]
    wordcloud_obj = WordCloud(background_color="white",
                              mask=None,
                              contour_width=1,
                              contour_color='white',
                              random_state=2018)
    wordcloud = wordcloud_obj.generate_from_frequencies(frequencies=freq)

    # Display the generated image
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis("off")

[Figure: Word clouds of player names per cluster, 2019]

df2019[df2019.columns[2:-1]] = df2019[df2019.columns[2:-1]].astype(float)
grouped = df2019.merge(df3[['Player', 'year', 'Age', 'Pos', 
                    'GS', 'MP', 'FG', 'TOV', 'PTS']], on=['Player', 'year'])
grouped[grouped.columns[-5:]] = grouped[grouped.columns[-5:]].astype(float)
grouped.groupby('y_predicted')[grouped.columns[2:]].mean().transpose()
y_predicted          0          1          2          3
G            67.527397  48.119048  66.785714  20.619048
3P            1.734247   0.736190   0.532143   0.341270
3P%           0.360445   0.314329   0.266179   0.223016
2P            3.584247   1.494286   4.810714   0.734921
2P%           0.494103   0.520929   0.575821   0.407683
FT%           0.793096   0.729205   0.711804   0.674730
eFG%          0.514411   0.522633   0.561929   0.388841
ORB           0.789726   0.616190   2.312500   0.296825
DRB           3.704110   2.073333   5.687500   1.112698
AST           3.690411   1.246667   2.221429   0.953968
STL           1.021233   0.447143   0.841071   0.304762
BLK           0.395205   0.281429   1.101786   0.112698
PF            2.203425   1.547143   2.701786   0.936508
Age          26.493151  26.214286  25.392857  24.603175
GS           50.020548   8.004762  49.125000   1.301587
MP           29.271918  15.933810  26.544643   9.588889
FG            5.315753   2.232381   5.341071   1.073016
TOV           1.806849   0.721905   1.608929   0.509524
PTS          14.733562   6.013810  13.685714   2.942857

When we cluster the players for just this one year, we see roughly the same clusters as in the aggregate clustering. There are still clusters for the following:

  • Big men/Defensive - Cluster 2. These players lead all clusters in rebounding and shot blocking, as in the aggregate clustering.
  • Elite/Stars - Cluster 0. These are the star players who play the most minutes and score the most points.
  • Bench Players - Cluster 3. These are the players with the fewest games played and the lowest overall stats. As in the aggregate clusters, these are developing players and rookies, as evidenced by their low average age.
  • Role Players - Cluster 1. These players are the role players of every team. Again mirroring the aggregate clustering, they are relatively balanced in their stats and can contribute in many ways.

Clustering 3 Point Shooters

As we’ve seen in the EDA section, there has been a growing trend of 3 point shooting in the league, led by the teams of Steph Curry and James Harden. In this section, we cluster the 3 point shooters in the league to see what differentiates them from one another. In selecting players to cluster, we limited these to players who attempted more than 200 3 point shots over the course of the season, the basis for candidacy for the NBA’s 3 point shooting crown. We are left with 158 players after applying this filter.

df6 = pd.read_sql('''SELECT * FROM shot_finder''', conn)
df6[df6.columns[4:]] = df6[df6.columns[4:]].astype(float)
df6 = df6[df6['3PA'] > 200]
df7 = df6[['3PA', '3P%', "%Ast'd"]]

To tease apart the elite 3 point shooters in the league, we look at three factors: the volume of shots, the accuracy of those shots, and the share of shots they create on their own. These correspond to 3PA, 3P%, and %Ast'd, respectively; note that %Ast'd is the percentage of a player’s made threes that were assisted, so a lower value means more self-created shots. These reflect the three factors most relevant in today’s NBA: the number, accuracy, and self-creation of 3 point shots. At the end of this clustering analysis, we will be able to bucket the different kinds of three point shooters in the league.

As there are only 3 features to be used, we will forego the Principal Component Analysis section and go straight to clustering these players.

kmeans_3p = KMeans(random_state=1337)
out = cluster_range(df7.to_numpy(), kmeans_3p, 16, actual=None)
plot_internal(out['inertias'], out['chs'], out['iidrs'], out['scs']);

[Figure: Internal validation criteria for k = 2 to 16, 3 point shooters]

Based on the internal validation criteria above, we select 6 as the optimal number of clusters. This is chosen as the elbow point of the intra-to-inter cluster distance ratio and the SSE, while maintaining a high silhouette score (although the difference between the maximum and minimum silhouette scores is only ~0.5).

# number of clusters
clusters3p = 6
y_predicted3p = out['ys'][clusters3p-2]

df6['y_predicted'] = y_predicted3p
print('Average Player Stats per Cluster:')
df6.groupby('y_predicted')[['3PA', '3P%', "%Ast'd"]].mean().transpose()
Average Player Stats per Cluster:
y_predicted           0           1          2           3         4          5
3PA          653.909091  378.472222  229.42000  465.083333  1028.000  305.25000
3P%            0.385182    0.371667    0.35136    0.358000     0.368    0.36225
%Ast'd         0.714091    0.818750    0.87790    0.769167     0.161    0.84625
print('Count of players per cluster:')
df6.groupby('y_predicted')[['3PA']].count().transpose()
Count of players per cluster:
y_predicted   0   1   2   3  4   5
3PA          11  36  50  24  1  36

The summary of each stat above shows the stark differences among the clusters that were formed, most notably the outlier cluster 4. Looking at the number of players per cluster confirms it: only one player belongs to cluster 4.

Based on the stats per cluster, we can see:

  • Cluster 2 - spot up shooters. These are shooters who prefer staying on the perimeter and waiting for the slashers or ball handlers to pass them the ball when they are open. This is evident in their low number of attempts and the high assist rate on their three pointers.
  • Cluster 5 - a mix of spot up and low volume shooters. These are players who are not traditionally pure three point shooters but can knock down the shot when called upon. Their high assist rate suggests that they, too, are mostly camped out on the perimeter, though they occasionally create shots on their own.
  • Cluster 1 - spot up sharpshooters. These players are mostly assisted on their 3 point makes, but shoot with high accuracy. As their stats show, they do not shoot a lot of 3 pointers relative to the higher volume clusters, but are confident when they do.

For clusters 0, 3, and 4, we look into the breakdown of players per cluster to get a better understanding of the composition of each.

rel_cols = ['Player', 'Tm', 'G', '3PA', '3P%', "%Ast'd"]
df6[df6['y_predicted']==3].sort_values('3PA', ascending=False)[rel_cols].head(10)
     Player            Tm   G     3PA    3P%    %Ast'd
426  Blake Griffin     DET  75.0  522.0  0.362  0.561
206  Jae Crowder       UTA  80.0  522.0  0.331  0.942
423  Donovan Mitchell  UTA  77.0  519.0  0.362  0.580
443  Luka Dončić       DAL  72.0  514.0  0.327  0.423
194  Brook Lopez       MIL  81.0  512.0  0.365  0.952
330  Joe Ingles        UTA  82.0  483.0  0.391  0.825
444  Trae Young        ATL  80.0  482.0  0.324  0.423
374  Tim Hardaway      TOT  65.0  477.0  0.340  0.728
412  Khris Middleton   MIL  77.0  474.0  0.378  0.603
344  Reggie Jackson    DET  82.0  471.0  0.369  0.793

For cluster 3, we see high volume shooters who can both spot up and create their own 3 point shot off the dribble, hitting them with good accuracy. In this cluster, we find some of the great 3 point shooters in the league, the likes of Trae Young, Khris Middleton, and others.

df6[df6['y_predicted']==0].sort_values('3PA', ascending=False)[rel_cols].head()
     Player          Tm   G     3PA    3P%    %Ast'd
383  Stephen Curry   GSW  69.0  810.0  0.437  0.689
392  Paul George     OKC  77.0  757.0  0.386  0.671
442  Kemba Walker    CHO  82.0  731.0  0.356  0.438
319  Buddy Hield     SAC  82.0  651.0  0.427  0.842
439  Damian Lillard  POR  80.0  643.0  0.369  0.460

In cluster 0, we see the truly elite 3 point shooters in the league: star players such as Stephen Curry, Paul George, Kemba Walker, and Damian Lillard. These players can create their own shot and shoot at a very high volume while maintaining their accuracy. Note that Steph Curry is by far the best shooter in this cluster, shooting the most three pointers at the highest accuracy while still creating his own shots.

df6[df6['y_predicted']==4].sort_values('3PA', ascending=False).head()
     Rk   Player        Season   Tm   G     FG     FGA     FG%    FGX    3P     3PA     3P%    3PX    eFG%   Ast'd  %Ast'd  y_predicted
449  450  James Harden  2018-19  HOU  78.0  378.0  1028.0  0.368  650.0  378.0  1028.0  0.368  650.0  0.552  61.0   0.161   4

In cluster 4, we find the true outlier of the 2018-19 NBA season in 3 point shooting. James Harden not only attempted the most 3 pointers but also shot a relatively high 36.8%, all while having the lowest assisted shot percentage, which shows that he takes most of these 3 pointers off his own dribble. This also likely means that these shots are contested, as he holds the ball before his shot. He is the only player in this cluster because he shot by far the most 3’s while having the lowest assisted 3 point rate.

X_players_new3p = TSNE(n_components=2,random_state=1337).fit_transform(df7.to_numpy())
fig, ax = plt.subplots()
ax.scatter(X_players_new3p[:,0], X_players_new3p[:,1], c=list(y_predicted3p), 
           alpha=0.5)
ax.set_title('TSNE Projection of Clusters');

[Figure: t-SNE projection of 3 point shooter clusters]

Lastly, when we plot the clusters on the t-SNE representation of the dataset, we see very good separation between the clusters, with no visible overlaps. In the lower left quadrant, we see the very elite shooters, with one outlier (James Harden). In testing different values of k, we found that James Harden still clusters on his own at each value, as his stats are above and beyond everyone else’s for this season.

Appendix

Players per 3 Point Cluster (clusters 2, 3, 5)

Cluster 2

df6[df6['y_predicted']==2].sort_values('3PA', ascending=False).head()
     Rk   Player          Season   Tm   G     FG    FGA    FG%    FGX    3P    3PA    3P%    3PX    eFG%   Ast'd  %Ast'd  y_predicted
144  145  Jonathan Isaac  2018-19  ORL  72.0  86.0  266.0  0.323  180.0  86.0  266.0  0.323  180.0  0.485  85.0   0.988   2
224  225  Garrett Temple  2018-19  TOT  72.0  90.0  264.0  0.341  174.0  90.0  264.0  0.341  174.0  0.511  84.0   0.933   2
236  237  Joel Embiid     2018-19  PHI  61.0  79.0  263.0  0.300  184.0  79.0  263.0  0.300  184.0  0.451  73.0   0.924   2
428  429  Dwyane Wade     2018-19  MIA  68.0  86.0  261.0  0.330  175.0  86.0  261.0  0.330  175.0  0.494  47.0   0.547   2
190  191  Tyler Johnson   2018-19  TOT  56.0  90.0  260.0  0.346  170.0  90.0  260.0  0.346  170.0  0.519  86.0   0.956   2

Cluster 3

df6[df6['y_predicted']==3].sort_values('3PA', ascending=False).head()
     Rk   Player            Season   Tm   G     FG     FGA    FG%    FGX    3P     3PA    3P%    3PX    eFG%   Ast'd  %Ast'd  y_predicted
426  427  Blake Griffin     2018-19  DET  75.0  189.0  522.0  0.362  333.0  189.0  522.0  0.362  333.0  0.543  106.0  0.561   3
206  207  Jae Crowder       2018-19  UTA  80.0  173.0  522.0  0.331  349.0  173.0  522.0  0.331  349.0  0.497  163.0  0.942   3
423  424  Donovan Mitchell  2018-19  UTA  77.0  188.0  519.0  0.362  331.0  188.0  519.0  0.362  331.0  0.543  109.0  0.580   3
443  444  Luka Dončić       2018-19  DAL  72.0  168.0  514.0  0.327  346.0  168.0  514.0  0.327  346.0  0.490  71.0   0.423   3
194  195  Brook Lopez       2018-19  MIL  81.0  187.0  512.0  0.365  325.0  187.0  512.0  0.365  325.0  0.548  178.0  0.952   3

Cluster 5

df6[df6['y_predicted']==5].sort_values('3PA', ascending=False).head()
     Rk   Player           Season   Tm   G     FG     FGA    FG%    FGX    3P     3PA    3P%    3PX    eFG%   Ast'd  %Ast'd  y_predicted
146  147  Dāvis Bertāns    2018-19  SAS  76.0  145.0  338.0  0.429  193.0  145.0  338.0  0.429  193.0  0.643  143.0  0.986   5
296  297  Kelly Oubre      2018-19  TOT  69.0  108.0  338.0  0.320  230.0  108.0  338.0  0.320  230.0  0.479  94.0   0.870   5
369  370  Terry Rozier     2018-19  BOS  78.0  119.0  337.0  0.353  218.0  119.0  337.0  0.353  218.0  0.530  88.0   0.739   5
163  164  Lauri Markkanen  2018-19  CHI  52.0  120.0  332.0  0.361  212.0  120.0  332.0  0.361  212.0  0.542  117.0  0.975   5
272  273  Malik Monk       2018-19  CHO  71.0  109.0  330.0  0.330  221.0  109.0  330.0  0.330  221.0  0.495  98.0   0.899   5