From f73aed7783d290a836b490654cffa59aa8abff36 Mon Sep 17 00:00:00 2001
From: "UnravelSports [JB]" <jors@unravelsports.com>
Date: Tue, 26 Nov 2024 18:36:26 +0100
Subject: [PATCH 01/10] intermediate

---
 unravel/american_football/graphs/dataset.py   |   4 +-
 .../graphs/graph_converter.py                 |  11 +-
 unravel/soccer/graphs/__init__.py             |   4 +
 unravel/soccer/graphs/dataset.py              | 330 +++++++++++++++
 unravel/soccer/graphs/features/__init__.py    |   4 +
 .../graphs/features/adjacency_matrix_pl.py    |  42 ++
 .../graphs/features/edge_features_pl.py       | 185 ++++++++
 .../graphs/features/node_features_pl.py       | 244 +++++++++++
 unravel/soccer/graphs/graph_converter_pl.py   | 400 ++++++++++++++++++
 unravel/soccer/graphs/graph_settings_pl.py    |  38 ++
 unravel/utils/features/utils.py               |  41 ++
 unravel/utils/objects/__init__.py             |   1 +
 unravel/utils/objects/default_dataset.py      |  13 +
 13 files changed, 1310 insertions(+), 7 deletions(-)
 create mode 100644 unravel/soccer/graphs/dataset.py
 create mode 100644 unravel/soccer/graphs/features/adjacency_matrix_pl.py
 create mode 100644 unravel/soccer/graphs/features/edge_features_pl.py
 create mode 100644 unravel/soccer/graphs/features/node_features_pl.py
 create mode 100644 unravel/soccer/graphs/graph_converter_pl.py
 create mode 100644 unravel/soccer/graphs/graph_settings_pl.py
 create mode 100644 unravel/utils/objects/default_dataset.py

diff --git a/unravel/american_football/graphs/dataset.py b/unravel/american_football/graphs/dataset.py
index 93368f5..fdb7310 100644
--- a/unravel/american_football/graphs/dataset.py
+++ b/unravel/american_football/graphs/dataset.py
@@ -7,11 +7,11 @@
 import numpy as np
 
 from .graph_settings import AmericanFootballPitchDimensions, Dimension, Unit
-from ...utils import add_dummy_label_column, add_graph_id_column
+from ...utils import DefaultDataset, add_dummy_label_column, add_graph_id_column
 
 
 @dataclass
-class BigDataBowlDataset:
+class BigDataBowlDataset(DefaultDataset):
     tracking_file_path: str
     players_file_path: str
     plays_file_path: str
diff --git a/unravel/american_football/graphs/graph_converter.py b/unravel/american_football/graphs/graph_converter.py
index 2f7bfd9..172164d 100644
--- a/unravel/american_football/graphs/graph_converter.py
+++ b/unravel/american_football/graphs/graph_converter.py
@@ -74,11 +74,6 @@ def _sport_specific_checks(self):
         if not isinstance(self.chunk_size, int):
             raise Exception("chunk_size should be of type integer (int)")
 
-        if not isinstance(self.attacking_non_qb_node_value, (int, float)):
-            raise Exception(
-                "'attacking_non_qb_node_value' should be of type float or integer (int)"
-            )
-
         if not self.label_col in self.dataset.columns and not self.prediction:
             raise Exception(
                 "Please specify a 'label_col' and add that column to your 'dataset' or set 'prediction=True' if you want to use the converted dataset to make predictions on."
@@ -89,6 +84,12 @@ def _sport_specific_checks(self):
                 "Please specify a 'graph_id_col' and add that column to your 'dataset' ..."
             )
 
+        # Parameter Checks
+        if not isinstance(self.attacking_non_qb_node_value, (int, float)):
+            raise Exception(
+                "'attacking_non_qb_node_value' should be of type float or integer (int)"
+            )
+
     def _apply_settings(self):
         return AmericanFootballGraphSettings(
             pitch_dimensions=self.pitch_dimensions,
diff --git a/unravel/soccer/graphs/__init__.py b/unravel/soccer/graphs/__init__.py
index 905585c..bd44fac 100644
--- a/unravel/soccer/graphs/__init__.py
+++ b/unravel/soccer/graphs/__init__.py
@@ -1,5 +1,9 @@
 from .graph_converter import SoccerGraphConverter
+from .graph_converter_pl import SoccerGraphConverterPL
 from .graph_settings import SoccerGraphSettings
+from .graph_settings_pl import GraphSettingsPL
 from .graph_frame import GraphFrame
 from .exceptions import *
 from .features import *
+
+from .dataset import KloppyDataset
diff --git a/unravel/soccer/graphs/dataset.py b/unravel/soccer/graphs/dataset.py
new file mode 100644
index 0000000..459b1d6
--- /dev/null
+++ b/unravel/soccer/graphs/dataset.py
@@ -0,0 +1,330 @@
+from kloppy.domain import (
+    TrackingDataset,
+    Frame,
+    Orientation,
+    DatasetTransformer,
+    DatasetFlag,
+    SecondSpectrumCoordinateSystem,
+)
+
+from typing import List, Dict, Union
+
+from dataclasses import field, dataclass
+
+from ...utils import DefaultDataset, add_dummy_label_column, add_graph_id_column
+
+import polars as pl
+
+
+DEFAULT_PLAYER_SMOOTHING_PARAMS = {"window_length": 7, "polyorder": 2}
+DEFAULT_BALL_SMOOTHING_PARAMS = {"window_length": 3, "polyorder": 2}
+
+
+@dataclass
+class SoccerObject:
+    id: Union[str, int]
+    team_id: Union[str, int]
+    position_name: str
+
+
+@dataclass
+class KloppyDataset(DefaultDataset):
+    kloppy_dataset: TrackingDataset
+    _identifier_column: str = field(default="id", init=False)
+    _partition_by: List[str] = field(
+        default_factory=lambda: ["id", "period_id"], init=False
+    )
+
+    def __transform_orientation(self):
+        if not self.kloppy_dataset.metadata.flags & DatasetFlag.BALL_OWNING_TEAM:
+            to_orientation = Orientation.STATIC_HOME_AWAY
+        else:
+            to_orientation = Orientation.BALL_OWNING_TEAM
+
+        self.kloppy_dataset = DatasetTransformer.transform_dataset(
+            dataset=self.kloppy_dataset,
+            to_orientation=to_orientation,
+            to_coordinate_system=SecondSpectrumCoordinateSystem(
+                pitch_length=self.kloppy_dataset.metadata.pitch_dimensions.pitch_length,
+                pitch_width=self.kloppy_dataset.metadata.pitch_dimensions.pitch_width,
+            ),
+        )
+        return self.kloppy_dataset
+
+    def __get_objects(self):
+        home_team, away_team = self.kloppy_dataset.metadata.teams
+
+        home_players = [
+            SoccerObject(p.player_id, p.team.team_id, p.starting_position.code)
+            for p in home_team.players
+        ]
+        away_players = [
+            SoccerObject(p.player_id, p.team.team_id, p.starting_position.code)
+            for p in away_team.players
+        ]
+        ball_object = SoccerObject("ball", None, "ball")
+        game_id = self.kloppy_dataset.metadata.game_id
+        return (home_players, away_players, ball_object, game_id)
+
+    def __unpivot(self, object, coordinate):
+        """Melt the wide `<object.id>_<coordinate>` column into long format."""
+        column = f"{object.id}_{coordinate}"
+        melted = self.data.unpivot(
+            index=[
+                "period_id",
+                "timestamp",
+                "frame_id",
+                "ball_state",
+                "ball_owning_team_id",
+            ],  # Columns to keep
+            on=[column],
+            value_name=coordinate,
+            variable_name=self._identifier_column,
+        )
+        # Drop only the trailing suffix; a first-match replace can corrupt ids.
+        return melted.with_columns(
+            pl.col(self._identifier_column).str.strip_suffix(f"_{coordinate}")
+        )
+
+    def __apply_smoothing(self, df: pl.DataFrame, smoothing_params: dict):
+        """Smooth the velocity components with a Savitzky-Golay filter.
+
+        The filter runs separately on "vx", "vy" and "vz" within each group of
+        `self._partition_by`.
+
+        Args:
+            df: Frame with "vx", "vy", "vz" and the partition columns.
+                NOTE(review): results are written back by row position, so the
+                rows of each partition presumably have to be contiguous.
+            smoothing_params: Dict with "window_length" and "polyorder", both
+                forwarded to `scipy.signal.savgol_filter`.
+
+        Returns:
+            `df` with "vx", "vy" and "vz" replaced by their smoothed values.
+
+        Raises:
+            ImportError: If scipy is not installed.
+            ValueError: If "window_length" or "polyorder" is missing.
+        """
+        try:
+            from scipy.signal import savgol_filter
+        except ImportError:
+            raise ImportError(
+                "Seems like you don't have scipy installed. Please"
+                " install it using: pip install scipy"
+            )
+
+        # Test for None rather than falsiness, so that the valid setting
+        # polyorder=0 is not reported as missing.
+        for param in ("window_length", "polyorder"):
+            if smoothing_params.get(param) is None:
+                raise ValueError(
+                    f"Missing parameter '{param}' in player_smoothing_params and/or ball_smoothing_params"
+                )
+
+        # One shared callback, so every component gets the same treatment
+        def smooth(values):
+            return savgol_filter(
+                values,
+                window_length=smoothing_params["window_length"],
+                polyorder=smoothing_params["polyorder"],
+            ).tolist()
+
+        components = ["vx", "vy", "vz"]
+        smoothed = df.group_by(self._partition_by, maintain_order=True).agg(
+            [
+                pl.col(c)
+                .map_elements(smooth, return_dtype=pl.List(pl.Float64))
+                .alias(f"{c}_smoothed")
+                for c in components
+            ]
+        )
+
+        # Explode the per-group lists back to one row per input row
+        smoothed_exploded = smoothed.explode([f"{c}_smoothed" for c in components])
+        # Write the smoothed values back by row position
+        return df.with_columns(
+            vx=smoothed_exploded["vx_smoothed"],
+            vy=smoothed_exploded["vy_smoothed"],
+            vz=smoothed_exploded["vz_smoothed"],
+        )
+
+    def __add_velocity(
+        self,
+        df: pl.DataFrame,
+        player_smoothing_params: dict,
+        ball_smoothing_params: dict,
+    ):
+        df = (
+            df.sort(["id", "period_id", "timestamp", "team_id"], nulls_last=True)
+            .with_columns(
+                [
+                    # Calculate differences within each group
+                    pl.col("x").diff().over(self._partition_by).alias("dx"),
+                    pl.col("y").diff().over(self._partition_by).alias("dy"),
+                    pl.col("z").diff().over(self._partition_by).alias("dz"),
+                    (pl.col("timestamp").dt.total_milliseconds() / 1_000)
+                    .diff()
+                    .over(self._partition_by)
+                    .alias("dt"),
+                ]
+            )
+            .with_columns(
+                [
+                    # Compute velocity components
+                    (pl.col("dx") / pl.col("dt")).alias("vx"),
+                    (pl.col("dy") / pl.col("dt")).alias("vy"),
+                    (pl.col("dz") / pl.col("dt")).alias("vz"),
+                ]
+            )
+            .with_columns(
+                [
+                    # Fill null values in vx, vy and vz
+                    pl.col("vx").fill_null(0).alias("vx"),
+                    pl.col("vy").fill_null(0).alias("vy"),
+                    pl.col("vz").fill_null(0).alias("vz"),
+                ]
+            )
+        )
+
+        if player_smoothing_params:
+            player_df = self.__apply_smoothing(
+                df=df.filter(pl.col(self._identifier_column) != self._ball_object.id),
+                smoothing_params=player_smoothing_params,
+            )
+        else:
+            player_df = df.filter(
+                pl.col(self._identifier_column) != self._ball_object.id
+            )
+
+        if ball_smoothing_params:
+            ball_df = self.__apply_smoothing(
+                df.filter(pl.col(self._identifier_column) == self._ball_object.id),
+                smoothing_params=ball_smoothing_params,
+            )
+        else:
+            ball_df = df.filter(pl.col(self._identifier_column) == self._ball_object.id)
+        df = pl.concat([player_df, ball_df])
+        df = df.with_columns(
+            [
+                (pl.col("vx") ** 2 + pl.col("vy") ** 2 + pl.col("vz") ** 2)
+                .sqrt()
+                .alias("v")
+            ]
+        )
+
+        return df
+
+    def __add_acceleration(self, df: pl.DataFrame):
+        df = (
+            df.with_columns(
+                [
+                    # Calculate differences in vx, vy and vz for acceleration
+                    pl.col("vx").diff().over(self._partition_by).alias("dvx"),
+                    pl.col("vy").diff().over(self._partition_by).alias("dvy"),
+                    pl.col("vz").diff().over(self._partition_by).alias("dvz"),
+                ]
+            )
+            .with_columns(
+                [
+                    # Compute ax, ay and az
+                    (pl.col("dvx") / pl.col("dt")).alias("ax"),
+                    (pl.col("dvy") / pl.col("dt")).alias("ay"),
+                    (pl.col("dvz") / pl.col("dt")).alias("az"),
+                ]
+            )
+            .with_columns(
+                [
+                    # Fill null values in ax, ay and az
+                    pl.col("ax").fill_null(0).alias("ax"),
+                    pl.col("ay").fill_null(0).alias("ay"),
+                    pl.col("az").fill_null(0).alias("az"),
+                ]
+            )
+            .with_columns(
+                [
+                    # Compute magnitude of acceleration a
+                    (pl.col("ax") ** 2 + pl.col("ay") ** 2 + pl.col("az") ** 2)
+                    .sqrt()
+                    .alias("a")
+                ]
+            )
+        )
+        return df
+
+    def __melt(
+        self,
+        home_players: List[SoccerObject],
+        away_players: List[SoccerObject],
+        ball_object: SoccerObject,
+        game_id: Union[int, str],
+    ):
+        """Reshape the wide per-object columns of `self.data` into long format.
+
+        Coordinates without an "<id>_<coordinate>" column are skipped; "z" is
+        only read for the ball and set to 0.0 where it is absent.
+        """
+        melted_dfs = []
+        columns = set(self.data.columns)
+        for obj in [ball_object] + home_players + away_players:
+            melted_object_dfs = []
+            for coordinate in ["x", "y", "z"]:
+                if obj.id != "ball" and coordinate == "z":
+                    continue
+                # Exact match: a substring test confuses ids like "1" and "10".
+                if f"{obj.id}_{coordinate}" not in columns:
+                    continue
+                melted_df = self.__unpivot(obj, coordinate)
+                # Keep the index columns once; later coordinates add one column.
+                if melted_object_dfs:
+                    melted_df = melted_df[[coordinate]]
+                melted_object_dfs.append(melted_df)
+            if not melted_object_dfs:
+                continue
+            object_df = pl.concat(melted_object_dfs, how="horizontal")
+            if "z" not in object_df.columns:
+                object_df = object_df.with_columns(z=pl.lit(0.0))
+            melted_dfs.append(
+                object_df.with_columns(
+                    team_id=pl.lit(obj.team_id).cast(pl.Utf8),
+                    position_name=pl.lit(obj.position_name),
+                )
+            )
+        df = pl.concat(melted_dfs, how="vertical")
+        df = df.with_columns(game_id=pl.lit(game_id))
+        return df.sort(by=["period_id", "timestamp", "team_id"], nulls_last=True)
+
+    def load(
+        self,
+        player_smoothing_params: Union[dict, None] = DEFAULT_PLAYER_SMOOTHING_PARAMS,
+        ball_smoothing_params: Union[dict, None] = DEFAULT_BALL_SMOOTHING_PARAMS,
+    ):
+        self.kloppy_dataset = self.__transform_orientation()
+        self.pitch_dimensions = self.kloppy_dataset.metadata.pitch_dimensions
+
+        self.data = self.kloppy_dataset.to_df(engine="polars")
+        (self._home_players, self._away_players, self._ball_object, self._game_id) = (
+            self.__get_objects()
+        )
+        df = self.__melt(
+            self._home_players, self._away_players, self._ball_object, self._game_id
+        )
+        df = self.__add_velocity(df, player_smoothing_params, ball_smoothing_params)
+        df = self.__add_acceleration(df)
+        self.data = df.drop(["dx", "dy", "dz", "dt", "dvx", "dvy", "dvz"])
+
+        return self.data, self.pitch_dimensions
+
+    def add_dummy_labels(
+        self,
+        by: List[str] = ["game_id", "frame_id"],
+        column_name: str = "label",
+    ) -> pl.DataFrame:
+        self.data = add_dummy_label_column(self.data, by, column_name)
+        return self.data
+
+    def add_graph_ids(
+        self, by: List[str] = ["game_id", "period_id"], column_name: str = "graph_id"
+    ) -> pl.DataFrame:
+        self.data = add_graph_id_column(self.data, by, column_name)
+        return self.data
diff --git a/unravel/soccer/graphs/features/__init__.py b/unravel/soccer/graphs/features/__init__.py
index 0a8744a..4135270 100644
--- a/unravel/soccer/graphs/features/__init__.py
+++ b/unravel/soccer/graphs/features/__init__.py
@@ -1,3 +1,7 @@
 from .adjacency_matrix import adjacency_matrix, delaunay_adjacency_matrix
 from .edge_features import edge_features
 from .node_features import node_features
+
+from .adjacency_matrix_pl import compute_adjacency_matrix_pl
+from .edge_features_pl import compute_edge_features_pl
+from .node_features_pl import compute_node_features_pl
diff --git a/unravel/soccer/graphs/features/adjacency_matrix_pl.py b/unravel/soccer/graphs/features/adjacency_matrix_pl.py
new file mode 100644
index 0000000..7a5b2d2
--- /dev/null
+++ b/unravel/soccer/graphs/features/adjacency_matrix_pl.py
@@ -0,0 +1,42 @@
+import numpy as np
+from scipy.spatial import Delaunay
+
+
+from ....utils import AdjacencyMatrixType, AdjacenyMatrixConnectType, distance_to_ball
+
+
+def compute_adjacency_matrix_pl(team, possession_team, settings, ball_carrier_idx):
+    """Build the node-by-node adjacency matrix for the nodes listed in `team`."""
+    adjacency_matrix_type = settings.adjacency_matrix_type
+    adjacency_matrix_connect_type = settings.adjacency_matrix_connect_type
+    ball_id = settings.ball_id
+
+    if adjacency_matrix_type == AdjacencyMatrixType.DENSE:
+        adjacency_matrix = np.ones((team.shape[0], team.shape[0])).astype(np.int32)
+    elif adjacency_matrix_type == AdjacencyMatrixType.DENSE_AP:
+        is_att = team == np.unique(possession_team)[0]
+        adjacency_matrix = np.outer(is_att, is_att).astype(int)
+    elif adjacency_matrix_type == AdjacencyMatrixType.DENSE_DP:
+        # Resolve the defending team only where it is needed, so the other
+        # matrix types no longer raise IndexError when no defender is present.
+        exclusion_ids = np.asarray([ball_id, *np.unique(possession_team)])
+        is_def = team == np.setdiff1d(team, exclusion_ids)[0]
+        adjacency_matrix = np.outer(is_def, is_def).astype(int)
+    elif adjacency_matrix_type == AdjacencyMatrixType.SPLIT_BY_TEAM:
+        adjacency_matrix = np.equal(team[:, None], team[None, :]).astype(np.int32)
+    elif adjacency_matrix_type == AdjacencyMatrixType.DELAUNAY:
+        raise NotImplementedError("Delaunay matrix not implemented for Soccer...")
+    else:
+        raise NotImplementedError("Please specify an existing AdjacencyMatrixType...")
+
+    if adjacency_matrix_connect_type:
+        # True wherever at least one endpoint of the edge is the ball node
+        ball_mask = (team[:, None] == ball_id) | (team[None, :] == ball_id)
+        if adjacency_matrix_connect_type == AdjacenyMatrixConnectType.BALL:
+            adjacency_matrix = np.where(ball_mask, 1, adjacency_matrix)
+        elif adjacency_matrix_connect_type == AdjacenyMatrixConnectType.BALL_CARRIER:
+            if ball_carrier_idx is not None:
+                adjacency_matrix[ball_carrier_idx, ball_mask[ball_carrier_idx, :]] = 1
+                adjacency_matrix[ball_mask[:, ball_carrier_idx], ball_carrier_idx] = 1
+
+    return adjacency_matrix
diff --git a/unravel/soccer/graphs/features/edge_features_pl.py b/unravel/soccer/graphs/features/edge_features_pl.py
new file mode 100644
index 0000000..3852e6d
--- /dev/null
+++ b/unravel/soccer/graphs/features/edge_features_pl.py
@@ -0,0 +1,185 @@
+import numpy as np
+
+from ....utils import (
+    normalize_distance,
+    normalize_speed,
+    normalize_sincos,
+    angle_between,
+    non_zeros,
+    reindex,
+)
+
+import numpy as np
+
+from ....utils import (
+    normalize_distance,
+    normalize_sincos,
+    non_zeros,
+    reindex,
+    normalize_speed_differences_nfl,
+    normalize_accelerations_nfl,
+)
+
+
+def compute_edge_features_pl(adjacency_matrix, p3d, p2d, s, velocity, team, settings):
+    """Compute the edge feature matrix, one row per non-zero adjacency entry.
+
+    Every feature is computed as a pairwise matrix, reduced to the connected
+    edges via `non_zeros` / `reindex`, and concatenated column-wise; NaNs in
+    the result are replaced by 0.
+
+    Columns: normalized distance, normalized speed difference, cos/sin of the
+    positional angle and cos/sin of the `angle_between` result.
+    """
+    max_dist_to_player = np.sqrt(
+        settings.pitch_dimensions.pitch_length**2
+        + settings.pitch_dimensions.pitch_width**2
+    )
+
+    # Pairwise distances via broadcasting
+    distances_between_players = np.linalg.norm(
+        p3d[:, None, :] - p3d[None, :, :], axis=-1
+    )
+    nan_mask = np.isnan(distances_between_players)
+    dist_matrix_normed = normalize_distance(
+        distances_between_players, max_distance=max_dist_to_player
+    )
+
+    speed_diff_matrix_normed = normalize_speed_differences_nfl(
+        s=np.nan_to_num(s[None, :] - s[:, None]),
+        team=team,
+        settings=settings,
+    )
+
+    # Vector between every pair of nodes, and their velocity difference
+    vect_to_player_matrix = p2d[:, None, :] - p2d[None, :, :]
+    v_normed_matrix = velocity[None, :, :] - velocity[:, None, :]
+
+    # Angles between players in sin and cos
+    angle_pos_matrix = np.nan_to_num(
+        np.arctan2(vect_to_player_matrix[:, :, 1], vect_to_player_matrix[:, :, 0])
+    )
+    pos_cos_matrix = normalize_sincos(np.nan_to_num(np.cos(angle_pos_matrix)))
+    pos_sin_matrix = normalize_sincos(np.nan_to_num(np.sin(angle_pos_matrix)))
+
+    combined_matrix = np.concatenate((vect_to_player_matrix, v_normed_matrix), axis=2)
+    angle_vel_matrix = np.apply_along_axis(angle_between, 2, combined_matrix)
+    vel_cos_matrix = normalize_sincos(np.nan_to_num(np.cos(angle_vel_matrix)))
+    vel_sin_matrix = normalize_sincos(np.nan_to_num(np.sin(angle_vel_matrix)))
+
+    feature_matrices = [
+        dist_matrix_normed,
+        speed_diff_matrix_normed,
+        pos_cos_matrix,
+        pos_sin_matrix,
+        vel_cos_matrix,
+        vel_sin_matrix,
+    ]
+    # Zero undefined pairs in every feature, incl. the velocity-angle ones
+    for matrix in feature_matrices:
+        matrix[nan_mask] = 0
+
+    non_zero_idxs, len_a = non_zeros(A=adjacency_matrix)
+    e = np.concatenate(
+        [reindex(matrix, non_zero_idxs, len_a) for matrix in feature_matrices], axis=1
+    )
+    return np.nan_to_num(e)
+
+
+# def edge_features(
+#     attacking_players,
+#     defending_players,
+#     ball,
+#     max_player_speed,
+#     max_ball_speed,
+#     pitch_dimensions,
+#     adjacency_matrix,
+#     delaunay_adjacency_matrix,
+# ):
+#     """
+#     # edge features matrix is (np.non_zero(a), n_edge_features) (nz, n_edge_features)
+#     # so for every connected edge in the adjacency matrix (a) we have 1 row of features describing that edge
+#     # to do this we compute all values for a single feature in a <=23x23 square matrix
+#     # reshape it to a (<=23**2, ) matrix and then mask all values that are 0 in `a` (nz)
+#     # then we concat all the features into a single (nz, n_edge_features) matrix
+#     """
+
+#     max_dist_to_player = np.sqrt(
+#         pitch_dimensions.pitch_length**2 + pitch_dimensions.pitch_width**2
+#     )
+
+#     players1 = players2 = attacking_players + defending_players + [ball]
+
+#     h_pos = np.asarray([p.position for p in players1])
+#     a_pos = np.asarray([p.position for p in players2])
+
+#     h_vel = np.asarray([p.velocity for p in players1])
+#     a_vel = np.asarray([p.velocity for p in players2])
+
+#     h_spe = np.asarray([p.speed for p in players1])
+#     a_spe = np.asarray([p.speed for p in players2])
+
+#     distances_between_players = np.linalg.norm(
+#         h_pos[:, None, :] - a_pos[None, :, :], axis=-1
+#     )
+#     nan_mask = np.isnan(distances_between_players)
+
+#     dist_matrix = normalize_distance(
+#         distances_between_players, max_distance=max_dist_to_player
+#     )  # 11x11
+
+#     speed_diff_matrix = np.nan_to_num(
+#         normalize_speed(a_spe[None, :], max_speed=max(max_player_speed, max_ball_speed))
+#         - normalize_speed(
+#             h_spe[:, None], max_speed=max(max_player_speed, max_ball_speed)
+#         )
+#     )  # 11x11x1
+
+#     vect_to_player_matrix = (
+#         h_pos[:, None, :] - a_pos[None, :, :]
+#     )  # 11x11x2 the vector between two players
+#     v_normed_matrix = a_vel[None, :, :] - h_vel[:, None, :]  # 11x11x2
+
+#     angle_pos_matrix = np.nan_to_num(
+#         np.arctan2(vect_to_player_matrix[:, :, 1], vect_to_player_matrix[:, :, 0])
+#     )
+#     pos_cos_matrix = normalize_sincos(np.nan_to_num(np.cos(angle_pos_matrix)))
+#     pos_sin_matrix = normalize_sincos(np.nan_to_num(np.sin(angle_pos_matrix)))
+
+#     combined_matrix = np.concatenate((vect_to_player_matrix, v_normed_matrix), axis=2)
+#     angle_vel_matrix = np.apply_along_axis(angle_between, 2, combined_matrix)
+#     vel_cos_matrix = normalize_sincos(np.nan_to_num(np.cos(angle_vel_matrix)))
+#     vel_sin_matrix = normalize_sincos(np.nan_to_num(np.sin(angle_vel_matrix)))
+
+#     non_zero_idxs, len_a = non_zeros(A=adjacency_matrix)
+#     # create a matrix where 1 if edge is same team else 0
+
+#     # if we have nan values we mask them to 0.
+#     # this only happens when we pad additional players
+#     dist_matrix[nan_mask] = 0
+#     speed_diff_matrix[nan_mask] = 0
+#     pos_cos_matrix[nan_mask] = 0
+#     pos_sin_matrix[nan_mask] = 0
+#     vel_cos_matrix[nan_mask] = 0
+#     vel_sin_matrix[nan_mask] = 0
+
+#     e_tuple = list(
+#         [
+#             # same_team_matrix[non_zero_idxs].reshape(len_a, 1),
+#             reindex(dist_matrix, non_zero_idxs, len_a),
+#             reindex(speed_diff_matrix, non_zero_idxs, len_a),
+#             reindex(pos_cos_matrix, non_zero_idxs, len_a),
+#             reindex(pos_sin_matrix, non_zero_idxs, len_a),
+#             reindex(vel_cos_matrix, non_zero_idxs, len_a),
+#             reindex(vel_sin_matrix, non_zero_idxs, len_a),
+#         ]
+#     )
+
+#     if delaunay_adjacency_matrix is not None:
+#         # if we are not using Delaunay as adjacency matrix,
+#         # use it as edge features to indicate "clear passing lines"
+#         extra_tuple = list([reindex(delaunay_adjacency_matrix, non_zero_idxs, len_a)])
+#         e_tuple.extend(extra_tuple)
+
+#     e = np.concatenate(e_tuple, axis=1)
+#     return np.nan_to_num(e)
diff --git a/unravel/soccer/graphs/features/node_features_pl.py b/unravel/soccer/graphs/features/node_features_pl.py
new file mode 100644
index 0000000..c1132b3
--- /dev/null
+++ b/unravel/soccer/graphs/features/node_features_pl.py
@@ -0,0 +1,244 @@
+import math
+import numpy as np
+
+from ....utils import (
+    normalize_coords,
+    normalize_speeds_nfl,
+    normalize_sincos,
+    normalize_distance,
+    unit_vector_from_angle,
+    normalize_speeds_nfl,
+    normalize_accelerations_nfl,
+    normalize_between,
+    unit_vector,
+    unit_vectors,
+    normalize_angles,
+    normalize_distance,
+    normalize_coords,
+    normalize_speed,
+    distance_to_ball,
+)
+
+
+def compute_node_features_pl(
+    x,
+    y,
+    s,
+    velocity,
+    team,
+    possession_team,
+    is_gk,
+    settings,
+):
+    ball_id = settings.ball_id
+
+    goal_mouth_position = (
+        settings.pitch_dimensions.x_dim.max,
+        (settings.pitch_dimensions.y_dim.max + settings.pitch_dimensions.y_dim.min) / 2,
+    )
+    max_dist_to_player = np.sqrt(
+        settings.pitch_dimensions.pitch_length**2
+        + settings.pitch_dimensions.pitch_width**2
+    )
+    max_dist_to_goal = np.sqrt(
+        settings.pitch_dimensions.pitch_length**2
+        + settings.pitch_dimensions.pitch_width**2
+    )
+
+    position, ball_position, dist_to_ball = distance_to_ball(
+        x=x, y=y, team=team, ball_id=ball_id
+    )
+
+    x_normed = normalize_between(
+        value=x,
+        max_value=settings.pitch_dimensions.x_dim.max,
+        min_value=settings.pitch_dimensions.x_dim.min,
+    )
+    y_normed = normalize_between(
+        value=y,
+        max_value=settings.pitch_dimensions.y_dim.max,
+        min_value=settings.pitch_dimensions.y_dim.min,
+    )
+    s_normed = normalize_speeds_nfl(s, team, settings)
+    uv_velocity = unit_vectors(velocity)
+
+    angles = normalize_angles(np.arctan2(uv_velocity[:, 1], uv_velocity[:, 0]))
+    sin_normed = normalize_sincos(np.sin(angles))
+    cos_normed = normalize_sincos(np.cos(angles))
+
+    dist_to_goal = np.linalg.norm(position - goal_mouth_position, axis=1)
+    normed_dist_to_goal = normalize_distance(
+        value=dist_to_goal, max_distance=max_dist_to_goal
+    )
+
+    normed_dist_to_ball = normalize_distance(
+        value=dist_to_ball, max_distance=max_dist_to_player
+    )
+
+    is_possession_team = np.where(
+        team == possession_team, 1, settings.defending_team_node_value
+    )
+
+    is_ball = np.where(team == ball_id, 1, 0)
+
+    X = np.nan_to_num(
+        np.stack(
+            (
+                x_normed,
+                y_normed,
+                uv_velocity[:, 0],
+                uv_velocity[:, 1],
+                s_normed,
+                sin_normed,
+                cos_normed,
+                normed_dist_to_goal,
+                normed_dist_to_ball,
+                is_possession_team,
+                is_gk,
+                is_ball,
+            ),
+            axis=-1,
+        )
+    )
+
+    return X
+
+
+# def node_features(
+#     attacking_players,
+#     defending_players,
+#     ball,
+#     max_player_speed,
+#     max_ball_speed,
+#     ball_carrier_idx,
+#     pitch_dimensions,
+#     include_ball_node: bool = True,
+#     defending_team_node_value: float = 0.1,
+#     non_potential_receiver_node_value: float = 0.1,
+# ):
+#     """
+#     node features matrix is (n_nodes, n_node_features) (<=23, 17)
+#     each player (and optionally ball) is a node
+
+#     player_features n_node_features must be equal to ball_features n_node_features
+#     """
+
+#     goal_mouth_position = (
+#         pitch_dimensions.pitch_length,
+#         pitch_dimensions.pitch_width / 2,
+#     )
+#     max_dist_to_player = np.sqrt(
+#         pitch_dimensions.pitch_length**2 + pitch_dimensions.pitch_width**2
+#     )
+#     max_dist_to_goal = np.sqrt(
+#         pitch_dimensions.pitch_length**2 + pitch_dimensions.pitch_width**2
+#     )
+
+#     def player_features(p, team, potential_receiver=None):
+#         ball_angle = math.atan2(p.y1 - ball.y1, p.x1 - ball.x1)
+#         goal_angle = math.atan2(
+#             p.y1 - goal_mouth_position[0], p.x1 - goal_mouth_position[1]
+#         )
+
+#         player_node_features = [
+#             (
+#                 0.0
+#                 if np.isnan(p.x1)
+#                 else normalize_coords(p.x1, pitch_dimensions.x_dim.max)
+#             ),
+#             (
+#                 0.0
+#                 if np.isnan(p.x1)
+#                 else normalize_coords(p.y1, pitch_dimensions.y_dim.max)
+#             ),
+#             0.0 if np.isnan(p.x1) else unit_vector(p.velocity)[0],
+#             0.0 if np.isnan(p.x1) else unit_vector(p.velocity)[1],
+#             (
+#                 0.0
+#                 if np.isnan(p.x1)
+#                 else round(normalize_speed(p.speed, max_speed=max_player_speed), 3)
+#             ),
+#             (
+#                 0.0
+#                 if np.isnan(p.x1)
+#                 else normalize_angles(np.arctan2(p.velocity[1], p.velocity[0]))
+#             ),
+#             (
+#                 0.0
+#                 if np.isnan(p.x1)
+#                 else normalize_distance(
+#                     np.linalg.norm(p.position - goal_mouth_position),
+#                     max_distance=max_dist_to_goal,
+#                 )
+#             ),  # distance to the goal mouth
+#             0.0 if np.isnan(p.x1) else normalize_angles(goal_angle),
+#             (
+#                 0.0
+#                 if np.isnan(p.x1)
+#                 else normalize_distance(
+#                     np.linalg.norm(p.position - ball.position),
+#                     max_distance=max_dist_to_player,
+#                 )
+#             ),  # distance to the ball
+#             0.0 if np.isnan(p.x1) else normalize_angles(ball_angle),
+#             0.0 if np.isnan(p.x1) else team,
+#             # 1 if player is on same team but not in possession, 0.1 for all other players, 0.1 if the player is 'missing'
+#             (
+#                 0.0
+#                 if np.isnan(p.x1)
+#                 else 1.0 if potential_receiver else non_potential_receiver_node_value
+#             ),
+#         ]
+#         return player_node_features
+
+#     def ball_features(ball):
+#         goal_angle = math.atan2(
+#             ball.y1 - goal_mouth_position[1], ball.x1 - goal_mouth_position[0]
+#         )
+#         ball_node_features = [
+#             normalize_coords(ball.x1, pitch_dimensions.x_dim.max),
+#             normalize_coords(ball.y1, pitch_dimensions.y_dim.max),
+#             unit_vector(ball.velocity)[0],
+#             unit_vector(ball.velocity)[1],
+#             round(normalize_speed(ball.speed, max_speed=max_ball_speed), 3),
+#             normalize_angles(np.arctan2(ball.velocity[1], ball.velocity[0])),
+#             normalize_distance(
+#                 np.linalg.norm(ball.position - goal_mouth_position),
+#                 max_distance=max_dist_to_goal,
+#             ),  # distance to the goal mouth
+#             normalize_angles(goal_angle),
+#             # ball_angle 2x, ball_dist 2x, attacking_team 2x, ball carrier, potential receiver (all always 0 for ball)
+#             0,
+#             0,
+#             0,
+#             0,  # , 0
+#         ]
+
+#         return np.asarray([ball_node_features])
+
+#     # loop over attacking players, grab ball_carrier, potential receiver and intended receiver
+#     ap_features = np.asarray(
+#         [
+#             player_features(p, team=1, potential_receiver=(i != ball_carrier_idx))
+#             for i, p in enumerate(attacking_players)
+#         ]
+#     )
+
+#     # loop over defending playres, we don't have ball_carrier, or receivers
+#     dp_features = np.asarray(
+#         [
+#             player_features(p, team=defending_team_node_value)
+#             for i, p in enumerate(defending_players)
+#         ]
+#     )
+
+#     # compute ball features
+#     b_features = ball_features(ball)
+#     X = np.append(ap_features, dp_features, axis=0)
+
+#     if include_ball_node:
+#         X = np.append(X, b_features, axis=0)
+
+#     # convert np.NaN to 0 (zero)
+#     X = np.nan_to_num(X)
+#     return X
diff --git a/unravel/soccer/graphs/graph_converter_pl.py b/unravel/soccer/graphs/graph_converter_pl.py
new file mode 100644
index 0000000..ad694f4
--- /dev/null
+++ b/unravel/soccer/graphs/graph_converter_pl.py
@@ -0,0 +1,400 @@
+import logging
+import sys
+from copy import deepcopy
+
+import pandas as pd
+
+import warnings
+
+from dataclasses import dataclass, field, asdict
+
+from typing import List, Union, Dict, Literal
+
+from kloppy.domain import (
+    TrackingDataset,
+    Frame,
+    Orientation,
+    DatasetTransformer,
+    DatasetFlag,
+    SecondSpectrumCoordinateSystem,
+    MetricPitchDimensions,
+)
+
+from spektral.data import Graph
+
+from .exceptions import (
+    MissingLabelsError,
+    MissingDatasetError,
+    IncorrectDatasetTypeError,
+    KeyMismatchError,
+)
+
+from .graph_settings_pl import GraphSettingsPL
+from .dataset import KloppyDataset
+from .features import (
+    compute_node_features_pl,
+    compute_adjacency_matrix_pl,
+    compute_edge_features_pl,
+)
+
+from ...utils import *
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
+stdout_handler = logging.StreamHandler(sys.stdout)
+logger.addHandler(stdout_handler)
+
+
+@dataclass(repr=True)
+class SoccerGraphConverterPL(DefaultGraphConverter):
+    """
+    Converts our dataset TrackingDataset into an internal structure
+
+    Attributes:
+        dataset (TrackingDataset): Kloppy TrackingDataset.
+        labels (dict): Dict with a key per frame_id, like so {frame_id: True/False/1/0}
+        graph_id (str, int): Set a single id for the whole Kloppy dataset.
+        graph_ids (dict): Frame level control over graph ids.
+
+        The graph_ids will be used to assign each graph an identifier. This identifier allows us to split the CustomSpektralDataset such that
+            all graphs with the same id are either all in the test, train or validation set to avoid leakage. It is recommended to either set graph_id (int, str) as
+            a match_id, or pass a dictionary into 'graph_ids' with exactly the same keys as 'labels' for more granular control over the graph ids.
+        The latter can be useful when splitting graphs by possession or sequence id. In this case the dict would be {frame_id: sequence_id/possession_id}.
+        Note that sequence_id/possession_id should probably be unique for the whole dataset. Perhaps like so {frame_id: 'match_id-sequence_id'}. Defaults to None.
+
+        infer_ball_ownership (bool):
+            Infers 'attacking_team' if no 'ball_owning_team' (Kloppy) or 'attacking_team' (List[Dict]) is provided, by finding player closest to ball using ball xyz.
+            Also infers ball_carrier within ball_carrier_treshold
+        infer_goalkeepers (bool): set True if no GK label is provided, set False for incomplete (broadcast tracking) data that might not have a GK in every frame
+        ball_carrier_treshold (float): The distance threshold to determine the ball carrier. Defaults to 25.0.
+        boundary_correction (float): A correction factor for boundary calculations, used to correct out-of-bounds values as a percentage (used as 1 + boundary_correction, e.g. 0.05). Defaults to None.
+        non_potential_receiver_node_value (float): Value between 0 and 1 to assign to the defending team players
+    """
+
+    dataset: KloppyDataset = None
+
+    label_col: str = "label"
+    graph_id_col: str = "graph_id"
+
+    chunk_size: int = 2_0000
+
+    infer_goalkeepers: bool = True
+    infer_ball_ownership: bool = True
+    boundary_correction: float = None
+    ball_carrier_treshold: float = 25.0
+
+    non_potential_receiver_node_value: float = 0.1
+
+    def __post_init__(self):
+        self.pitch_dimensions: MetricPitchDimensions = self.dataset.pitch_dimensions
+        self.dataset = self.dataset.data
+
+        self._sport_specific_checks()
+        self.settings = self._apply_settings()
+        self.dataset = self._apply_filters()
+
+    def _apply_filters(self):
+        return self.dataset.with_columns(
+            pl.when(
+                (pl.col(self.settings._identifier_column) == self.settings.ball_id)
+                & (pl.col("v") > self.settings.max_ball_speed)
+            )
+            .then(self.settings.max_ball_speed)
+            .when(
+                (pl.col(self.settings._identifier_column) != self.settings.ball_id)
+                & (pl.col("v") > self.settings.max_player_speed)
+            )
+            .then(self.settings.max_player_speed)
+            .otherwise(pl.col("v"))
+            .alias("v")
+        ).with_columns(
+            pl.when(
+                (pl.col(self.settings._identifier_column) == self.settings.ball_id)
+                & (pl.col("a") > self.settings.max_ball_acceleration)
+            )
+            .then(self.settings.max_ball_acceleration)
+            .when(
+                (pl.col(self.settings._identifier_column) != self.settings.ball_id)
+                & (pl.col("a") > self.settings.max_player_acceleration)
+            )
+            .then(self.settings.max_player_acceleration)
+            .otherwise(pl.col("a"))
+            .alias("a")
+        )
+
+    def _apply_settings(self):
+        return GraphSettingsPL(
+            pitch_dimensions=self.pitch_dimensions,
+            ball_carrier_treshold=self.ball_carrier_treshold,
+            max_player_speed=self.max_player_speed,
+            max_ball_speed=self.max_ball_speed,
+            max_player_acceleration=self.max_player_acceleration,
+            max_ball_acceleration=self.max_ball_acceleration,
+            boundary_correction=self.boundary_correction,
+            self_loop_ball=self.self_loop_ball,
+            adjacency_matrix_connect_type=self.adjacency_matrix_connect_type,
+            adjacency_matrix_type=self.adjacency_matrix_type,
+            label_type=self.label_type,
+            infer_ball_ownership=self.infer_ball_ownership,
+            infer_goalkeepers=self.infer_goalkeepers,
+            defending_team_node_value=self.defending_team_node_value,
+            non_potential_receiver_node_value=self.non_potential_receiver_node_value,
+            random_seed=self.random_seed,
+            pad=self.pad,
+            verbose=self.verbose,
+        )
+
+    def _sport_specific_checks(self):
+        if not isinstance(self.label_col, str):
+            raise Exception("'label_col' should be of type string (str)")
+
+        if not isinstance(self.graph_id_col, str):
+            raise Exception("'graph_id_col' should be of type string (str)")
+
+        if not isinstance(self.chunk_size, int):
+            raise Exception("chunk_size should be of type integer (int)")
+
+        if not self.label_col in self.dataset.columns and not self.prediction:
+            raise Exception(
+                "Please specify a 'label_col' and add that column to your 'dataset' or set 'prediction=True' if you want to use the converted dataset to make predictions on."
+            )
+
+        if not self.graph_id_col in self.dataset.columns:
+            raise Exception(
+                "Please specify a 'graph_id_col' and add that column to your 'dataset' ..."
+            )
+
+        # Parameter Checks
+        if not isinstance(self.infer_goalkeepers, bool):
+            raise Exception("'infer_goalkeepers' should be of type boolean (bool)")
+
+        if not isinstance(self.infer_ball_ownership, bool):
+            raise Exception("'infer_ball_ownership' should be of type boolean (bool)")
+
+        if self.boundary_correction and not isinstance(self.boundary_correction, float):
+            raise Exception("'boundary_correction' should be of type float")
+
+        if self.ball_carrier_treshold and not isinstance(
+            self.ball_carrier_treshold, float
+        ):
+            raise Exception("'ball_carrier_treshold' should be of type float")
+
+        if self.non_potential_receiver_node_value and not isinstance(
+            self.non_potential_receiver_node_value, float
+        ):
+            raise Exception(
+                "'non_potential_receiver_node_value' should be of type float"
+            )
+
+    def _convert(self):
+        def __compute(args: List[pl.Series]) -> dict:
+            """Build one graph (adjacency, edge and node features) for a group.
+
+            `args` holds one Series per entry of the `exprs` list passed to
+            `pl.map_groups` in `_convert`, in that same order.
+            """
+            x = args[0].to_numpy()
+            y = args[1].to_numpy()
+            z = args[2].to_numpy()
+            v = args[3].to_numpy()
+            vx = args[4].to_numpy()
+            vy = args[5].to_numpy()
+            # Indices 6-10 (vz, a, ax, ay, az) are not used by any feature below.
+            team_id = args[11].to_numpy()
+            position_name = args[12].to_numpy()
+            ball_owning_team_id = args[13].to_numpy()
+            graph_id = args[14].to_numpy()
+            label = args[15].to_numpy()
+
+            if not np.all(graph_id == graph_id[0]):
+                raise Exception(
+                    "GraphId selection contains multiple different values. Make sure each GraphId is unique by at least playId and frameId..."
+                )
+
+            if not self.prediction and not np.all(label == label[0]):
+                raise Exception(
+                    "Label selection contains multiple different values for a single selection (group by) of playId and frameId, make sure this is not the case. Each group can only have 1 label."
+                )
+
+            ball_carrier_idx = get_ball_carrier_idx(
+                x=x,
+                y=y,
+                z=z,
+                team=team_id,
+                possession_team=ball_owning_team_id,
+                ball_id=self.settings.ball_id,
+                threshold=self.settings.ball_carrier_treshold,
+            )
+
+            adjacency_matrix = compute_adjacency_matrix_pl(
+                team=team_id,
+                possession_team=ball_owning_team_id,
+                settings=self.settings,
+                ball_carrier_idx=ball_carrier_idx,
+            )
+            edge_features = compute_edge_features_pl(
+                adjacency_matrix=adjacency_matrix,
+                p3d=np.stack((x, y, z), axis=-1),
+                p2d=np.stack((x, y), axis=-1),
+                s=v,
+                velocity=np.stack((vx, vy), axis=-1),
+                team=team_id,
+                settings=self.settings,
+            )
+            node_features = compute_node_features_pl(
+                x,
+                y,
+                s=v,
+                velocity=np.stack((vx, vy), axis=-1),
+                team=team_id,
+                possession_team=ball_owning_team_id,
+                is_gk=(position_name == self.settings.goalkeeper_id).astype(int),
+                settings=self.settings,
+            )
+            return {
+                "e": pl.Series(
+                    [edge_features.tolist()], dtype=pl.List(pl.List(pl.Float64))
+                ),
+                "x": pl.Series(
+                    [node_features.tolist()], dtype=pl.List(pl.List(pl.Float64))
+                ),
+                "a": pl.Series(
+                    [adjacency_matrix.tolist()], dtype=pl.List(pl.List(pl.Int32))
+                ),
+                "e_shape_0": edge_features.shape[0],
+                "e_shape_1": edge_features.shape[1],
+                "x_shape_0": node_features.shape[0],
+                "x_shape_1": node_features.shape[1],
+                "a_shape_0": adjacency_matrix.shape[0],
+                "a_shape_1": adjacency_matrix.shape[1],
+                self.graph_id_col: graph_id[0],
+                self.label_col: label[0],
+            }
+
+        result_df = self.dataset.group_by(
+            ["game_id", "frame_id"], maintain_order=True
+        ).agg(
+            pl.map_groups(
+                exprs=[
+                    "x",
+                    "y",
+                    "z",
+                    "v",
+                    "vx",
+                    "vy",
+                    "vz",
+                    "a",
+                    "ax",
+                    "ay",
+                    "az",
+                    "team_id",
+                    "position_name",
+                    "ball_owning_team_id",
+                    self.graph_id_col,
+                    self.label_col,
+                ],
+                function=__compute,
+            ).alias("result_dict")
+        )
+
+        graph_df = result_df.with_columns(
+            [
+                pl.col("result_dict").struct.field("a").alias("a"),
+                pl.col("result_dict").struct.field("e").alias("e"),
+                pl.col("result_dict").struct.field("x").alias("x"),
+                pl.col("result_dict").struct.field("e_shape_0").alias("e_shape_0"),
+                pl.col("result_dict").struct.field("e_shape_1").alias("e_shape_1"),
+                pl.col("result_dict").struct.field("x_shape_0").alias("x_shape_0"),
+                pl.col("result_dict").struct.field("x_shape_1").alias("x_shape_1"),
+                pl.col("result_dict").struct.field("a_shape_0").alias("a_shape_0"),
+                pl.col("result_dict").struct.field("a_shape_1").alias("a_shape_1"),
+                pl.col("result_dict")
+                .struct.field(self.graph_id_col)
+                .alias(self.graph_id_col),
+                pl.col("result_dict")
+                .struct.field(self.label_col)
+                .alias(self.label_col),
+            ]
+        )
+
+        return graph_df.drop("result_dict")
+
+    def to_graph_frames(self) -> List[dict]:
+        def __convert_to_graph_data_list(df):
+            lazy_df = df.lazy()
+
+            graph_list = []
+
+            for chunk in lazy_df.collect().iter_slices(self.chunk_size):
+                chunk_graph_list = [
+                    {
+                        "a": make_sparse(
+                            flatten_to_reshaped_array(
+                                arr=chunk["a"][i],
+                                s0=chunk["a_shape_0"][i],
+                                s1=chunk["a_shape_1"][i],
+                            )
+                        ),
+                        "x": flatten_to_reshaped_array(
+                            arr=chunk["x"][i],
+                            s0=chunk["x_shape_0"][i],
+                            s1=chunk["x_shape_1"][i],
+                        ),
+                        "e": flatten_to_reshaped_array(
+                            arr=chunk["e"][i],
+                            s0=chunk["e_shape_0"][i],
+                            s1=chunk["e_shape_1"][i],
+                        ),
+                        "y": np.asarray([chunk[self.label_col][i]]),
+                        "id": chunk[self.graph_id_col][i],
+                    }
+                    for i in range(len(chunk["a"]))
+                ]
+                graph_list.extend(chunk_graph_list)
+
+            return graph_list
+
+        graph_df = self._convert()
+        self.graph_frames = __convert_to_graph_data_list(graph_df)
+
+        return self.graph_frames
+
+    def to_spektral_graphs(self) -> List[Graph]:
+        if not self.graph_frames:
+            self.to_graph_frames()
+
+        return [
+            Graph(
+                x=d["x"],
+                a=d["a"],
+                e=d["e"],
+                y=d["y"],
+                id=d["id"],
+            )
+            for d in self.graph_frames
+        ]
+
+    def to_pickle(self, file_path: str) -> None:
+        """
+        We store the 'dict' version of the Graphs in a gzip-compressed pickle; each graph is a dict with keys x, a, e, y and id
+        To use for training with Spektral feed the loaded pickle data to CustomDataset(data=pickled_data)
+        """
+        if not file_path.endswith("pickle.gz"):
+            raise ValueError(
+                "Only compressed pickle files of type 'some_file_name.pickle.gz' are supported..."
+            )
+
+        if not self.graph_frames:
+            self.to_graph_frames()
+
+        import pickle
+        import gzip
+        from pathlib import Path
+
+        path = Path(file_path)
+
+        directories = path.parent
+        directories.mkdir(parents=True, exist_ok=True)
+
+        with gzip.open(file_path, "wb") as file:
+            pickle.dump(self.graph_frames, file)
diff --git a/unravel/soccer/graphs/graph_settings_pl.py b/unravel/soccer/graphs/graph_settings_pl.py
new file mode 100644
index 0000000..0ef8dce
--- /dev/null
+++ b/unravel/soccer/graphs/graph_settings_pl.py
@@ -0,0 +1,38 @@
+from dataclasses import dataclass
+
+from ...utils import DefaultGraphSettings
+
+from dataclasses import dataclass, field
+from kloppy.domain import Dimension, Unit, MetricPitchDimensions
+from typing import Optional
+
+
+@dataclass
+class GraphSettingsPL(DefaultGraphSettings):
+    ball_id: str = "ball"  # identifier-column value the converter compares against to find the ball
+    goalkeeper_id: str = "GK"  # position_name value the converter compares against to flag goalkeepers
+    infer_goalkeepers: bool = True
+    boundary_correction: float = None
+    non_potential_receiver_node_value: float = 0.1  # clamped into [0, 1] by _sport_specific_checks
+    ball_carrier_treshold: float = 25.0
+    pitch_dimensions: MetricPitchDimensions = field(  # NOTE(review): name is rebound by the property below - confirm intended
+        init=False, repr=False, default_factory=MetricPitchDimensions
+    )
+    _identifier_column: str = field(default="id", init=False)
+
+    def __post_init__(self):
+        self._sport_specific_checks()
+
+    @property
+    def pitch_dimensions(self) -> MetricPitchDimensions:
+        return self._pitch_dimensions  # value stored by the setter below
+
+    @pitch_dimensions.setter
+    def pitch_dimensions(self, pitch_dimensions: MetricPitchDimensions) -> None:
+        self._pitch_dimensions = pitch_dimensions
+
+    def _sport_specific_checks(self):  # clamp non_potential_receiver_node_value into [0, 1]
+        if self.non_potential_receiver_node_value > 1:
+            self.non_potential_receiver_node_value = 1
+        elif self.non_potential_receiver_node_value < 0:
+            self.non_potential_receiver_node_value = 0
diff --git a/unravel/utils/features/utils.py b/unravel/utils/features/utils.py
index d74c931..282fe61 100644
--- a/unravel/utils/features/utils.py
+++ b/unravel/utils/features/utils.py
@@ -69,6 +69,15 @@ def unit_vector(vector):
     return vector / norm
 
 
+def unit_vectors(vectors):
+    # Row-wise Euclidean norms, kept 2-D so they broadcast against `vectors`.
+    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
+    # Zero-length rows get a divisor of 1, so they stay all-zero instead of NaN.
+    safe_norms = np.where(norms == 0, 1, norms)
+
+    return vectors / safe_norms
+
+
 def normalize_coords(value, max_value):
     return value / max_value
 
@@ -172,3 +181,35 @@ def flatten_to_reshaped_array(arr, s0, s1, as_list=False):
     # Concatenate the arrays into one single array
     result_array = np.concatenate(flattened_list).reshape(s0, s1)
     return result_array if not as_list else result_array.tolist()
+
+
+def distance_to_ball(
+    x: np.array, y: np.array, team: np.array, ball_id: str, z: np.array = None
+):
+    """Return (positions, ball position, per-row distance to the ball).
+
+    Positions are 3D when `z` is given and 2D otherwise. The ball is the first
+    row whose `team` equals `ball_id`; without such a row the origin is used.
+    """
+    axes = (x, y) if z is None else (x, y, z)
+    position = np.stack(axes, axis=-1)
+    ball_rows = np.where(team == ball_id)[0]
+    if ball_rows.size >= 1:
+        ball_position = position[ball_rows][0]
+    else:
+        ball_position = np.asarray([0.0] * len(axes))
+    dist_to_ball = np.linalg.norm(position - ball_position, axis=1)
+    return position, ball_position, dist_to_ball
+
+
+def get_ball_carrier_idx(x, y, z, team, possession_team, ball_id, threshold):
+    _, _, dist_to_ball = distance_to_ball(x=x, y=y, z=z, team=team, ball_id=ball_id)
+    # Keep only possession-team rows within `threshold` of the ball (others -> inf).
+    filtered_distances = np.where(
+        (team != possession_team) | (dist_to_ball > threshold), np.inf, dist_to_ball
+    )
+    # Closest remaining candidate, or None when no row qualifies.
+    ball_carrier_idx = (
+        np.argmin(filtered_distances) if np.isfinite(filtered_distances).any() else None
+    )
+    return ball_carrier_idx
diff --git a/unravel/utils/objects/__init__.py b/unravel/utils/objects/__init__.py
index 3940548..4299b16 100644
--- a/unravel/utils/objects/__init__.py
+++ b/unravel/utils/objects/__init__.py
@@ -5,3 +5,4 @@
 from .default_graph_frame import DefaultGraphFrame
 from .default_graph_settings import DefaultGraphSettings
 from .default_graph_converter import DefaultGraphConverter
+from .default_dataset import DefaultDataset
diff --git a/unravel/utils/objects/default_dataset.py b/unravel/utils/objects/default_dataset.py
new file mode 100644
index 0000000..b31280e
--- /dev/null
+++ b/unravel/utils/objects/default_dataset.py
@@ -0,0 +1,13 @@
+from dataclasses import dataclass
+
+
+@dataclass
+class DefaultDataset:  # interface only: every method below raises until a subclass overrides it
+    def load(self):
+        raise NotImplementedError()
+
+    def add_dummy_labels(self):
+        raise NotImplementedError()
+
+    def add_graph_ids(self):
+        raise NotImplementedError()

From 30a6775083d81f2d097a776a8e3b35cb40c7320e Mon Sep 17 00:00:00 2001
From: "UnravelSports [JB]" <jors@unravelsports.com>
Date: Sat, 25 Jan 2025 11:39:18 +0100
Subject: [PATCH 02/10] polars, working

---
 examples/2_big_data_bowl_guide.ipynb          |   2 +-
 tests/test_kloppy_polars.py                   | 217 +++++++++++
 unravel/soccer/graphs/__init__.py             |   6 +-
 unravel/soccer/graphs/dataset.py              | 221 ++++++++++--
 unravel/soccer/graphs/graph_converter_pl.py   | 337 ++++++++++--------
 unravel/soccer/graphs/graph_settings_pl.py    |   3 +-
 .../utils/objects/default_graph_settings.py   |   3 -
 7 files changed, 615 insertions(+), 174 deletions(-)
 create mode 100644 tests/test_kloppy_polars.py

diff --git a/examples/2_big_data_bowl_guide.ipynb b/examples/2_big_data_bowl_guide.ipynb
index faf20ea..b6ed01c 100644
--- a/examples/2_big_data_bowl_guide.ipynb
+++ b/examples/2_big_data_bowl_guide.ipynb
@@ -218,7 +218,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.9"
+   "version": "3.11.11"
   }
  },
  "nbformat": 4,
diff --git a/tests/test_kloppy_polars.py b/tests/test_kloppy_polars.py
new file mode 100644
index 0000000..08e3e6a
--- /dev/null
+++ b/tests/test_kloppy_polars.py
@@ -0,0 +1,217 @@
+from pathlib import Path
+from unravel.soccer import (
+    SoccerGraphConverterPolars, 
+    KloppyPolarsDataset
+)
+from unravel.utils import (
+    dummy_labels,
+    dummy_graph_ids,
+    CustomSpektralDataset,
+)
+
+from kloppy import skillcorner
+from kloppy.domain import Ground, TrackingDataset, Orientation
+from typing import List, Dict
+
+from spektral.data import Graph
+
+import pytest
+
+import numpy as np
+
+
+class TestKloppyPolarsData:
+    @pytest.fixture
+    def match_data(self, base_dir: Path) -> str:
+        return base_dir / "files" / "skillcorner_match_data.json"
+
+    @pytest.fixture
+    def structured_data(self, base_dir: Path) -> str:
+        return base_dir / "files" / "skillcorner_structured_data.json.gz"
+
+    @pytest.fixture()
+    def kloppy_dataset(self, match_data: str, structured_data: str) -> TrackingDataset:
+        return skillcorner.load(
+            raw_data=structured_data,
+            meta_data=match_data,
+            coordinates="tracab",
+            include_empty_frames=False,
+            limit=500,
+        )
+        
+    @pytest.fixture()
+    def kloppy_polars_dataset(self, kloppy_dataset: TrackingDataset) -> KloppyPolarsDataset:
+        dataset = KloppyPolarsDataset(
+            kloppy_dataset=kloppy_dataset,
+            ball_carrier_threshold=25.0,
+        )
+        dataset.load()
+        dataset.add_dummy_labels(
+            by=["game_id", "frame_id"]
+        )
+        dataset.add_graph_ids(
+            by=["game_id", "frame_id"]
+        )
+        return dataset
+    
+    @pytest.fixture()
+    def spc_padding(self, kloppy_polars_dataset: KloppyPolarsDataset) -> SoccerGraphConverterPolars:
+        return SoccerGraphConverterPolars(
+            dataset=kloppy_polars_dataset,
+            chunk_size=2_0000,
+            non_potential_receiver_node_value=0.1,
+            max_player_speed=12.0,
+            max_player_acceleration=12.0,
+            max_ball_speed=13.5,
+            max_ball_acceleration=100,
+            self_loop_ball=True,
+            adjacency_matrix_connect_type="ball",
+            adjacency_matrix_type="split_by_team",
+            label_type="binary",
+            defending_team_node_value=0.0,
+            random_seed=False,
+            pad=True,
+            verbose=False,
+        )
+
+    @pytest.fixture()
+    def soccer_polars_converter(self, kloppy_polars_dataset: KloppyPolarsDataset) -> SoccerGraphConverterPolars:
+        # TODO: 
+        # check if 
+        # - random_seed
+        # - padding needs to be per team_id otherwise stuff breaks
+        # all work as expected and/or should be moved to the KloppyPolarsDataset
+        
+        return SoccerGraphConverterPolars(
+            dataset=kloppy_polars_dataset,
+            chunk_size=2_0000,
+            non_potential_receiver_node_value=0.1,
+            max_player_speed=12.0,
+            max_player_acceleration=12.0,
+            max_ball_speed=13.5,
+            max_ball_acceleration=100,
+            self_loop_ball=True,
+            adjacency_matrix_connect_type="ball",
+            adjacency_matrix_type="split_by_team",
+            label_type="binary",
+            defending_team_node_value=0.0,
+            random_seed=False,
+            pad=False,
+            verbose=False,
+        )
+
+    # @pytest.fixture()
+    # def gnnc_padding_random(self, dataset: TrackingDataset) -> SoccerGraphConverter:
+    #     return SoccerGraphConverter(
+    #         dataset=dataset,
+    #         labels=dummy_labels(dataset),
+    #         # settings
+    #         ball_carrier_treshold=25.0,
+    #         max_player_speed=12.0,
+    #         max_ball_speed=28.0,
+    #         boundary_correction=None,
+    #         self_loop_ball=False,
+    #         adjacency_matrix_connect_type="ball",
+    #         adjacency_matrix_type="split_by_team",
+    #         label_type="binary",
+    #         defending_team_node_value=0.0,
+    #         non_potential_receiver_node_value=0.1,
+    #         infer_ball_ownership=True,
+    #         infer_goalkeepers=True,
+    #         random_seed=42,
+    #         pad=True,
+    #         verbose=False,
+    #     )
+    def test_padding(self, spc_padding: SoccerGraphConverterPolars):
+        """
+        Test that the padded converter produces the expected number of Spektral graphs
+        """
+        spektral_graphs = spc_padding.to_spektral_graphs()
+
+        assert 1 == 1
+
+        data = spektral_graphs
+        assert len(data) == 384
+        assert isinstance(data[0], Graph)
+
+    def test_to_spektral_graph(self, soccer_polars_converter: SoccerGraphConverterPolars):
+        """
+        Test graph shapes, sample feature values and dataset splits for the unpadded converter
+        """
+        spektral_graphs = soccer_polars_converter.to_spektral_graphs()
+
+        assert 1 == 1
+        
+        data = spektral_graphs
+        assert len(data) == 489
+        assert isinstance(data[0], Graph)
+
+        x = data[0].x
+        assert x.shape == (10, 12)
+        assert 0.31373436337428573 == pytest.approx(x[0, 0], abs=1e-5)
+        assert 0.06765375015355701 == pytest.approx(x[0, 4], abs=1e-5)
+        assert 0.47729475229688306 == pytest.approx(x[8, 2], abs=1e-5)
+
+        e = data[0].e
+        assert e.shape == (60, 6)
+        assert 0.0 == pytest.approx(e[0, 0], abs=1e-5)
+        assert 0.5 == pytest.approx(e[0, 4], abs=1e-5)
+        assert 0.579979482018554 == pytest.approx(e[8, 2], abs=1e-5)
+
+        a = data[0].a
+        assert a.shape == (10, 10)
+        assert 1.0 == pytest.approx(a[0, 0], abs=1e-5)
+        assert 1.0 == pytest.approx(a[0, 4], abs=1e-5)
+        assert 0.0 == pytest.approx(a[8, 2], abs=1e-5)
+
+        dataset = CustomSpektralDataset(graphs=spektral_graphs)
+        N, F, S, n_out, n = dataset.dimensions()
+        assert N == 20
+        assert F == 12
+        assert S == 6
+        assert n_out == 1
+        assert n == 489
+
+        train, test, val = dataset.split_test_train_validation(
+            split_train=4,
+            split_test=1,
+            split_validation=1,
+            by_graph_id=True,
+            random_seed=42,
+        )
+        assert train.n_graphs == 326
+        assert test.n_graphs == 81
+        assert val.n_graphs == 82
+
+        train, test, val = dataset.split_test_train_validation(
+            split_train=4,
+            split_test=1,
+            split_validation=1,
+            by_graph_id=False,
+            random_seed=42,
+        )
+        assert train.n_graphs == 326
+        assert test.n_graphs == 81
+        assert val.n_graphs == 82
+
+        train, test = dataset.split_test_train(
+            split_train=4, split_test=1, by_graph_id=False, random_seed=42
+        )
+        assert train.n_graphs == 391
+        assert test.n_graphs == 98
+
+        train, test = dataset.split_test_train(
+            split_train=4, split_test=5, by_graph_id=False, random_seed=42
+        )
+        assert train.n_graphs == 217
+        assert test.n_graphs == 272
+
+        with pytest.raises(
+            NotImplementedError,
+            match="Make sure split_train > split_test >= split_validation, other behaviour is not supported when by_graph_id is True...",
+        ):
+            dataset.split_test_train(
+                split_train=4, split_test=5, by_graph_id=True, random_seed=42
+            )
+
+    
diff --git a/unravel/soccer/graphs/__init__.py b/unravel/soccer/graphs/__init__.py
index bd44fac..2991890 100644
--- a/unravel/soccer/graphs/__init__.py
+++ b/unravel/soccer/graphs/__init__.py
@@ -1,9 +1,9 @@
 from .graph_converter import SoccerGraphConverter
-from .graph_converter_pl import SoccerGraphConverterPL
+from .graph_converter_pl import SoccerGraphConverterPolars
 from .graph_settings import SoccerGraphSettings
-from .graph_settings_pl import GraphSettingsPL
+from .graph_settings_pl import GraphSettingsPolars
 from .graph_frame import GraphFrame
 from .exceptions import *
 from .features import *
 
-from .dataset import KloppyDataset
+from .dataset import KloppyPolarsDataset
diff --git a/unravel/soccer/graphs/dataset.py b/unravel/soccer/graphs/dataset.py
index 459b1d6..b9c4243 100644
--- a/unravel/soccer/graphs/dataset.py
+++ b/unravel/soccer/graphs/dataset.py
@@ -16,8 +16,8 @@
 import polars as pl
 
 
-DEFAULT_PLAYER_SMOOTHING_PARAMS = {"window_length": 7, "polyorder": 2}
-DEFAULT_BALL_SMOOTHING_PARAMS = {"window_length": 3, "polyorder": 2}
+DEFAULT_PLAYER_SMOOTHING_PARAMS = {"window_length": 7, "polyorder": 1}
+DEFAULT_BALL_SMOOTHING_PARAMS = {"window_length": 3, "polyorder": 1}
 
 
 @dataclass
@@ -28,19 +28,27 @@ class SoccerObject:
 
 
 @dataclass
-class KloppyDataset(DefaultDataset):
+class KloppyPolarsDataset(DefaultDataset):
     kloppy_dataset: TrackingDataset
+    ball_carrier_threshold: float = None
     _identifier_column: str = field(default="id", init=False)
+    _graph_id_column: str = field(default="graph_id")
+    _label_column: str = field(default="label")
     _partition_by: List[str] = field(
         default_factory=lambda: ["id", "period_id"], init=False
     )
-
+    _infer_ball_owning_team_id: bool = field(default=False, init=False)
+    _overwrite_orientation: bool = field(default=False, init=False)
+    _infer_goalkeepers: bool = field(default=False, init=False)
+    
     def __transform_orientation(self):
         if not self.kloppy_dataset.metadata.flags & DatasetFlag.BALL_OWNING_TEAM:
+            self._overwrite_orientation = True
+            # In this package attacking is always left to right, so if this is not given in Kloppy, overwrite it
             to_orientation = Orientation.STATIC_HOME_AWAY
         else:
             to_orientation = Orientation.BALL_OWNING_TEAM
-
+            
         self.kloppy_dataset = DatasetTransformer.transform_dataset(
             dataset=self.kloppy_dataset,
             to_orientation=to_orientation,
@@ -52,18 +60,35 @@ def __transform_orientation(self):
         return self.kloppy_dataset
 
     def __get_objects(self):
+        def __artificial_game_id() -> str:
+            from uuid import uuid4
+            return str(uuid4())
+        
         home_team, away_team = self.kloppy_dataset.metadata.teams
-
-        home_players = [
-            SoccerObject(p.player_id, p.team.team_id, p.starting_position.code)
-            for p in home_team.players
-        ]
-        away_players = [
-            SoccerObject(p.player_id, p.team.team_id, p.starting_position.code)
-            for p in away_team.players
-        ]
-        ball_object = SoccerObject("ball", None, "ball")
+        
+        if all(item is None for item in [p.starting_position for p in home_team.players]):
+            self._infer_goalkeepers = True
+            home_players = [
+                SoccerObject(p.player_id, p.team.team_id, None)
+                for p in home_team.players
+            ]
+            away_players = [
+                SoccerObject(p.player_id, p.team.team_id, None)
+                for p in away_team.players
+            ]
+        else:
+            home_players = [
+                SoccerObject(p.player_id, p.team.team_id, p.starting_position.code)
+                for p in home_team.players
+            ]
+            away_players = [
+                SoccerObject(p.player_id, p.team.team_id, p.starting_position.code)
+                for p in away_team.players
+            ]
+        ball_object = SoccerObject("ball", "ball", "ball")
         game_id = self.kloppy_dataset.metadata.game_id
+        if game_id is None:
+            game_id = __artificial_game_id()
         return (home_players, away_players, ball_object, game_id)
 
     def __unpivot(self, object, coordinate):
@@ -271,6 +296,10 @@ def __melt(
                     continue
 
                 melted_df = self.__unpivot(object, coordinate)
+                
+                if object.id == "ball" and coordinate == "z":
+                    if melted_df[coordinate].is_null().all():
+                        melted_df = melted_df.with_columns([pl.lit(0.0).alias("z")])
                 if k == 0:
                     melted_object_dfs.append(melted_df)
                 else:
@@ -288,17 +317,145 @@ def __melt(
                 )
 
                 melted_dfs.append(object_df)
-
+        
         df = pl.concat(melted_dfs, how="vertical")
         df = df.with_columns([pl.lit(game_id).alias("game_id")])
         df = df.sort(by=["period_id", "timestamp", "team_id"], nulls_last=True)
         return df
+    
+    def __get_inferred_ball_owning_team_id(self, df: pl.DataFrame):
+        non_ball_owning_team = (
+            df.filter(pl.col("ball_owning_team_id").is_null())
+        )
+        ball_owning_team = (
+            df.filter(~pl.col("ball_owning_team_id").is_null())
+        )
+        
+        ball = (
+            non_ball_owning_team.filter(pl.col('team_id') == "ball")
+        )
+        players = (
+            non_ball_owning_team.filter(pl.col('team_id') != "ball")
+        )
+        result = (
+            players.drop('ball_owning_team_id')
+            .join(
+                ball.select(
+                    ['game_id', 'period_id', 'frame_id', 
+                    pl.col('x').alias('ball_x'),
+                    pl.col('y').alias('ball_y'), 
+                    pl.col('z').alias('ball_z')]
+                ),
+                on=['game_id', 'period_id', 'frame_id'],
+                how='left'
+            )
+            .with_columns([
+                ((pl.col('x') - pl.col('ball_x'))**2 + 
+                (pl.col('y') - pl.col('ball_y'))**2 + 
+                (pl.col('z') - pl.col('ball_z'))**2
+                ).sqrt().alias('distance')
+            ])
+            .group_by(['game_id', 'period_id', 'frame_id'])
+            .agg([
+                pl.when(pl.col('distance').min() < self.ball_carrier_threshold)
+                .then(pl.col('team_id').filter(pl.col('distance') == pl.col('distance').min()).first())
+                .otherwise(None)
+                .alias('ball_owning_team_id'),
+                pl.all().sort_by('distance').first()
+            ])
+        )
+        non_ball_owning_team = (
+            non_ball_owning_team.drop('ball_owning_team_id')
+            .join(
+                result.select(['game_id', 'period_id', 'frame_id', 'ball_owning_team_id']),
+                on=['game_id', 'period_id', 'frame_id'],
+                how='left'
+            )
+            .filter(
+                ~pl.col("ball_owning_team_id").is_null()
+            )
+            .with_columns([
+                pl.col("ball_owning_team_id").cast(ball_owning_team.schema['team_id'])
+            ])
+            .select(ball_owning_team.columns)
+        )
+        ball_owning_team = (
+            ball_owning_team
+            .with_columns([
+                pl.col("ball_owning_team_id").cast(ball_owning_team.schema['team_id'])
+            ])
+        )
+        
+        new_df = (
+            pl.concat([
+                ball_owning_team,
+                non_ball_owning_team
+            ], how="vertical")
+            .sort(['game_id', 'period_id', 'frame_id', 'team_id'])
+        )
+        return new_df
+    
+    def __get_inferred_goalkeepers(self, df: pl.DataFrame):
+        goal_x = self.pitch_dimensions.pitch_length / 2
+        goal_y = 0
+        
+        df_with_distances = (
+            df.filter(pl.col('team_id') != "ball")
+            .with_columns([
+                ((pl.col('x') - (-goal_x))**2 + (pl.col('y') - goal_y)**2).sqrt().alias('dist_left'),
+                ((pl.col('x') - goal_x)**2 + (pl.col('y') - goal_y)**2).sqrt().alias('dist_right')
+            ])
+        )
+        result = (
+            df_with_distances
+            .with_columns([
+                pl.col('dist_left').min().over(['game_id', 'period_id', 'frame_id', 'team_id']).alias('min_dist_left'),
+                pl.col('dist_right').min().over(['game_id', 'period_id', 'frame_id', 'team_id']).alias('min_dist_right')
+            ])
+            .with_columns([
+                pl.when(pl.col('team_id') == pl.col('ball_owning_team_id'))
+                .then(
+                    pl.when(pl.col('dist_left') == pl.col('min_dist_left'))
+                    .then(pl.lit('GK'))
+                    .otherwise(None)
+                )
+                .otherwise(
+                    pl.when(pl.col('dist_right') == pl.col('min_dist_right'))
+                    .then(pl.lit('GK'))
+                    .otherwise(None)
+                )
+                .alias('position_name')
+            ])
+            .drop(['min_dist_left', 'min_dist_right', 'dist_left', 'dist_right'])
+        )
+        ball_rows = df.filter(pl.col('team_id') == "ball")
+        non_ball_rows = result
+
+        return (
+            pl.concat([ball_rows, non_ball_rows], how="vertical")
+            .sort(['game_id', 'period_id', 'frame_id', 'team_id'])
+        )
+        
+    def __fix_orientation_to_ball_owning(self, df: pl.DataFrame, home_team_id: Union[str, int]):
+        # When _overwrite_orientation is True, it means the orientation is "STATIC_HOME_AWAY"
+        # This means that when away is the attacking team we can flip all coordinates by -1.0
+        
+        flip_columns = ['x', 'y', 'vx', 'vy', 'ax', 'ay']
+        
+        return df.with_columns([
+            pl.when(pl.col('ball_owning_team_id').cast(str) != str(home_team_id))
+            .then(pl.col(flip_columns) * -1)
+            .otherwise(pl.col(flip_columns))
+        ])
 
     def load(
         self,
         player_smoothing_params: Union[dict, None] = DEFAULT_PLAYER_SMOOTHING_PARAMS,
         ball_smoothing_params: Union[dict, None] = DEFAULT_BALL_SMOOTHING_PARAMS,
     ):
+        if self.kloppy_dataset.metadata.orientation == Orientation.NOT_SET:
+            raise ValueError("Data sources with an undefined orientation can not be used inside the 'unravelsports' package...")
+        
         self.kloppy_dataset = self.__transform_orientation()
         self.pitch_dimensions = self.kloppy_dataset.metadata.pitch_dimensions
 
@@ -309,22 +466,40 @@ def load(
         df = self.__melt(
             self._home_players, self._away_players, self._ball_object, self._game_id
         )
+        
         df = self.__add_velocity(df, player_smoothing_params, ball_smoothing_params)
         df = self.__add_acceleration(df)
-        self.data = df.drop(["dx", "dy", "dz", "dt", "dvx", "dvy", "dvz"])
-
+        df = df.drop(["dx", "dy", "dz", "dt", "dvx", "dvy", "dvz"])
+        
+        df = df.filter(
+            ~(pl.col('x').is_null() & pl.col('y').is_null())
+        )
+        
+        if df['ball_owning_team_id'].is_null().all() and self.ball_carrier_threshold is None:  # cannot infer ownership without a threshold
+                raise ValueError("This dataset requires us to infer the ball_owning_team_id, please specifiy a ball_carrier_threshold (float) to do so.")
+        
+        if self.ball_carrier_threshold is not None:
+            df = self.__get_inferred_ball_owning_team_id(df)
+            
+        if self._overwrite_orientation:
+            home_team, _ = self.kloppy_dataset.metadata.teams
+            df = self.__fix_orientation_to_ball_owning(df, home_team_id=home_team.team_id)
+        
+        if self._infer_goalkeepers:
+            df = self.__get_inferred_goalkeepers(df)
+        
+        self.data = df
         return self.data, self.pitch_dimensions
 
     def add_dummy_labels(
         self,
-        by: List[str] = ["game_id", "frame_id"],
-        column_name: str = "label",
+        by: List[str] = ["game_id", "frame_id"]
     ) -> pl.DataFrame:
-        self.data = add_dummy_label_column(self.data, by, column_name)
+        self.data = add_dummy_label_column(self.data, by, self._label_column)
         return self.data
 
     def add_graph_ids(
-        self, by: List[str] = ["game_id", "period_id"], column_name: str = "graph_id"
+        self, by: List[str] = ["game_id", "period_id"]
     ) -> pl.DataFrame:
-        self.data = add_graph_id_column(self.data, by, column_name)
+        self.data = add_graph_id_column(self.data, by, self._graph_id_column)
         return self.data
diff --git a/unravel/soccer/graphs/graph_converter_pl.py b/unravel/soccer/graphs/graph_converter_pl.py
index ad694f4..252ff03 100644
--- a/unravel/soccer/graphs/graph_converter_pl.py
+++ b/unravel/soccer/graphs/graph_converter_pl.py
@@ -8,7 +8,7 @@
 
 from dataclasses import dataclass, field, asdict
 
-from typing import List, Union, Dict, Literal
+from typing import List, Union, Dict, Literal, Any
 
 from kloppy.domain import (
     TrackingDataset,
@@ -29,8 +29,8 @@
     KeyMismatchError,
 )
 
-from .graph_settings_pl import GraphSettingsPL
-from .dataset import KloppyDataset
+from .graph_settings_pl import GraphSettingsPolars
+from .dataset import KloppyPolarsDataset
 from .features import (
     compute_node_features_pl,
     compute_adjacency_matrix_pl,
@@ -46,7 +46,7 @@
 
 
 @dataclass(repr=True)
-class SoccerGraphConverterPL(DefaultGraphConverter):
+class SoccerGraphConverterPolars(DefaultGraphConverter):
     """
     Converts our dataset TrackingDataset into an internal structure
 
@@ -62,36 +62,124 @@ class SoccerGraphConverterPL(DefaultGraphConverter):
         The latter can be useful when splitting graphs by possession or sequence id. In this case the dict would be {frame_id: sequence_id/possession_id}.
         Note that sequence_id/possession_id should probably be unique for the whole dataset. Perhaps like so {frame_id: 'match_id-sequence_id'}. Defaults to None.
 
-        infer_ball_ownership (bool):
-            Infers 'attacking_team' if no 'ball_owning_team' (Kloppy) or 'attacking_team' (List[Dict]) is provided, by finding player closest to ball using ball xyz.
-            Also infers ball_carrier within ball_carrier_threshold
-        infer_goalkeepers (bool): set True if no GK label is provider, set False for incomplete (broadcast tracking) data that might not have a GK in every frame
         ball_carrier_threshold (float): The distance threshold to determine the ball carrier. Defaults to 25.0.
-        boundary_correction (float): A correction factor for boundary calculations, used to correct out of bounds as a percentages (Used as 1+boundary_correction, ie 0.05). Defaults to None.
         non_potential_receiver_node_value (float): Value between 0 and 1 to assign to the defing team players
     """
 
-    dataset: KloppyDataset = None
-
-    label_col: str = "label"
-    graph_id_col: str = "graph_id"
+    dataset: KloppyPolarsDataset = None
 
     chunk_size: int = 2_0000
-
-    infer_goalkeepers: bool = True
-    infer_ball_ownership: bool = True
-    boundary_correction: float = None
-    ball_carrier_treshold: float = 25.0
-
     non_potential_receiver_node_value: float = 0.1
 
     def __post_init__(self):
         self.pitch_dimensions: MetricPitchDimensions = self.dataset.pitch_dimensions
+        self.label_col = self.dataset._label_column
+        self.graph_id_col = self.dataset._graph_id_column
+        
+        self.ball_carrier_threshold = self.dataset.ball_carrier_threshold
         self.dataset = self.dataset.data
 
         self._sport_specific_checks()
         self.settings = self._apply_settings()
         self.dataset = self._apply_filters()
+        
+        if self.pad:
+            self.dataset = self._apply_padding(df=self.dataset)
+    
+    @staticmethod   
+    def _apply_padding(df: pl.DataFrame) -> pl.DataFrame:
+        keep_columns = [
+            'timestamp',
+            'ball_state',
+            'position_name',
+            'label',
+            'graph_id'
+        ]
+        empty_columns = [
+            'id', 'x', 'y', 'z', 'vx', 'vy',
+            'vz', 'v', 'ax', 'ay', 'az', 'a'
+        ]
+        group_by_columns = ['game_id', 'period_id', 'frame_id', 'team_id', 'ball_owning_team_id']
+        
+        counts = (
+            df.group_by(group_by_columns)
+            .agg(
+                pl.len().alias('count'),
+                *[pl.first(col).alias(col) for col in keep_columns]
+            )
+        )
+        
+        counts = counts.with_columns([
+            pl.when(pl.col('team_id') == "ball")
+            .then(1)
+            .when(pl.col('team_id') == pl.col('ball_owning_team_id'))
+            .then(11)
+            .otherwise(11)
+            .alias('target_length')
+        ])
+        
+        groups_to_pad = (
+            counts
+            .filter(pl.col('count') < pl.col('target_length'))
+            .with_columns(
+                (pl.col('target_length') - pl.col('count')).alias('repeats')
+            )
+        )
+        
+        if len(groups_to_pad) == 0:
+            return df
+            
+        padding_rows = []
+        for row in groups_to_pad.iter_rows(named=True):
+            base_row = {col: row[col] for col in keep_columns + group_by_columns}
+            padding_rows.extend([base_row] * row['repeats'])
+        
+        padding_df = pl.DataFrame(padding_rows)
+        
+        schema = df.schema
+        padding_df = padding_df.with_columns([
+            pl.lit(0.0 if schema[col] != pl.String else "None").cast(schema[col]).alias(col)
+            for col in empty_columns
+        ])
+        
+        padding_df = padding_df.select(df.columns)
+        
+        result = pl.concat([df, padding_df], how='vertical')
+        
+        total_frames = (
+            result.select(['game_id', 'period_id', 'frame_id'])
+            .unique()
+            .height
+        )
+        
+        frame_completeness = (
+            result.group_by(['game_id', 'period_id', 'frame_id'])
+            .agg([
+                (pl.col('team_id').eq("ball").sum() == 1).alias('has_ball'),
+                (pl.col('team_id').eq(pl.col('ball_owning_team_id')).sum() == 11).alias('has_owning_team'),
+                ((~pl.col('team_id').eq("ball") & ~pl.col('team_id').eq(pl.col('ball_owning_team_id'))).sum() == 11).alias('has_other_team')
+            ])
+            .filter(
+                pl.col('has_ball') & pl.col('has_owning_team') & pl.col('has_other_team')
+            )
+        )
+        
+        complete_frames = frame_completeness.height
+        
+        dropped_frames = total_frames - complete_frames
+        if dropped_frames > 0:
+            import warnings
+            warnings.warn(
+                f"""Setting pad=True drops frames that do not have at least 1 object for the attacking team, defending team or ball.
+                This operation dropped {dropped_frames} incomplete frames out of {total_frames} total frames ({(dropped_frames/total_frames)*100:.2f}%)
+                """
+            )
+        
+        return result.join(
+            frame_completeness,
+            on=['game_id', 'period_id', 'frame_id'],
+            how='inner'
+        )
 
     def _apply_filters(self):
         return self.dataset.with_columns(
@@ -123,20 +211,17 @@ def _apply_filters(self):
         )
 
     def _apply_settings(self):
-        return GraphSettingsPL(
+        return GraphSettingsPolars(
             pitch_dimensions=self.pitch_dimensions,
-            ball_carrier_treshold=self.ball_carrier_treshold,
+            ball_carrier_treshold=self.ball_carrier_threshold,
             max_player_speed=self.max_player_speed,
             max_ball_speed=self.max_ball_speed,
             max_player_acceleration=self.max_player_acceleration,
             max_ball_acceleration=self.max_ball_acceleration,
-            boundary_correction=self.boundary_correction,
             self_loop_ball=self.self_loop_ball,
             adjacency_matrix_connect_type=self.adjacency_matrix_connect_type,
             adjacency_matrix_type=self.adjacency_matrix_type,
             label_type=self.label_type,
-            infer_ball_ownership=self.infer_ball_ownership,
-            infer_goalkeepers=self.infer_goalkeepers,
             defending_team_node_value=self.defending_team_node_value,
             non_potential_receiver_node_value=self.non_potential_receiver_node_value,
             random_seed=self.random_seed,
@@ -164,20 +249,10 @@ def _sport_specific_checks(self):
                 "Please specify a 'graph_id_col' and add that column to your 'dataset' ..."
             )
 
-        # Parameter Checks
-        if not isinstance(self.infer_goalkeepers, bool):
-            raise Exception("'infer_goalkeepers' should be of type boolean (bool)")
-
-        if not isinstance(self.infer_ball_ownership, bool):
-            raise Exception("'infer_ball_ownership' should be of type boolean (bool)")
-
-        if self.boundary_correction and not isinstance(self.boundary_correction, float):
-            raise Exception("'boundary_correction' should be of type float")
-
-        if self.ball_carrier_treshold and not isinstance(
-            self.ball_carrier_treshold, float
+        if self.ball_carrier_threshold and not isinstance(
+            self.ball_carrier_threshold, float
         ):
-            raise Exception("'ball_carrier_treshold' should be of type float")
+            raise Exception("'ball_carrier_threshold' should be of type float")
 
         if self.non_potential_receiver_node_value and not isinstance(
             self.non_potential_receiver_node_value, float
@@ -185,115 +260,91 @@ def _sport_specific_checks(self):
             raise Exception(
                 "'non_potential_receiver_node_value' should be of type float"
             )
-
-    def _convert(self):
-        def __compute(args: List[pl.Series]) -> dict:
-            x = args[0].to_numpy()
-            y = args[1].to_numpy()
-            z = args[2].to_numpy()
-            v = args[3].to_numpy()
-            vx = args[4].to_numpy()
-            vy = args[5].to_numpy()
-            vz = args[6].to_numpy()
-            a = args[7].to_numpy()
-            ax = args[8].to_numpy()
-            ay = args[9].to_numpy()
-            az = args[10].to_numpy()
-
-            team_id = args[6].to_numpy()
-            position_name = args[7].to_numpy()
-            ball_owning_team_id = args[8].to_numpy()
-            graph_id = args[9].to_numpy()
-            label = args[10].to_numpy()
-
-            if not np.all(graph_id == graph_id[0]):
-                raise Exception(
-                    "GraphId selection contains multiple different values. Make sure each GraphId is unique by at least playId and frameId..."
-                )
-
-            if not self.prediction and not np.all(label == label[0]):
-                raise Exception(
-                    "Label selection contains multiple different values for a single selection (group by) of playId and frameId, make sure this is not the case. Each group can only have 1 label."
-                )
-
-            ball_carrier_idx = get_ball_carrier_idx(
-                x=x,
-                y=y,
-                z=z,
-                team=team_id,
-                possession_team=ball_owning_team_id,
-                ball_id=self.settings.ball_id,
-                threshold=self.settings.ball_carrier_treshold,
+            
+    @property
+    def __exprs_variables(self):
+        return [
+            "x", "y", "z",
+            "v", "vx", "vy", "vz",
+            "a", "ax", "ay", "az",
+            "team_id", "position_name", "ball_owning_team_id",
+            self.graph_id_col,
+            self.label_col,
+        ]
+    
+    def __compute(self, args: List[pl.Series]) -> dict:
+        d = {col: args[i].to_numpy() for i, col in enumerate(self.__exprs_variables)}
+        
+        if not np.all(d[self.graph_id_col] == d[self.graph_id_col][0]):
+            raise Exception(
+                "GraphId selection contains multiple different values. Make sure each graph_id is unique by at least game_id and frame_id..."
             )
 
-            adjacency_matrix = compute_adjacency_matrix_pl(
-                team=team_id,
-                possession_team=ball_owning_team_id,
-                settings=self.settings,
-                ball_carrier_idx=ball_carrier_idx,
-            )
-            edge_features = compute_edge_features_pl(
-                adjacency_matrix=adjacency_matrix,
-                p3d=np.stack((x, y, z), axis=-1),
-                p2d=np.stack((x, y), axis=-1),
-                s=v,
-                velocity=np.stack((vx, vy), axis=-1),
-                team=team_id,
-                settings=self.settings,
-            )
-            node_features = compute_node_features_pl(
-                x,
-                y,
-                s=v,
-                velocity=np.stack((vx, vy), axis=-1),
-                team=team_id,
-                possession_team=ball_owning_team_id,
-                is_gk=(position_name == self.settings.goalkeeper_id).astype(int),
-                settings=self.settings,
+        if not self.prediction and not np.all(d[self.label_col] == d[self.label_col][0]):
+            raise Exception(
+                """Label selection contains multiple different values for a single selection (group by) of game_id and frame_id, 
+                make sure this is not the case. Each group can only have 1 label."""
             )
-            return {
-                "e": pl.Series(
-                    [edge_features.tolist()], dtype=pl.List(pl.List(pl.Float64))
-                ),
-                "x": pl.Series(
-                    [node_features.tolist()], dtype=pl.List(pl.List(pl.Float64))
-                ),
-                "a": pl.Series(
-                    [adjacency_matrix.tolist()], dtype=pl.List(pl.List(pl.Int32))
-                ),
-                "e_shape_0": edge_features.shape[0],
-                "e_shape_1": edge_features.shape[1],
-                "x_shape_0": node_features.shape[0],
-                "x_shape_1": node_features.shape[1],
-                "a_shape_0": adjacency_matrix.shape[0],
-                "a_shape_1": adjacency_matrix.shape[1],
-                self.graph_id_col: graph_id[0],
-                self.label_col: label[0],
-            }
-
+        
+        ball_carrier_idx = get_ball_carrier_idx(
+            x=d['x'], y=d['y'], z=d['z'],
+            team=d['team_id'],
+            possession_team=d['ball_owning_team_id'],
+            ball_id=self.settings.ball_id,
+            threshold=self.settings.ball_carrier_treshold,
+        )
+        adjacency_matrix = compute_adjacency_matrix_pl(
+            team=d['team_id'],
+            possession_team=d['ball_owning_team_id'],
+            settings=self.settings,
+            ball_carrier_idx=ball_carrier_idx,
+        )
+        edge_features = compute_edge_features_pl(
+            adjacency_matrix=adjacency_matrix,
+            p3d=np.stack((d['x'], d['y'], d['z']), axis=-1),
+            p2d=np.stack((d['x'], d['y']), axis=-1),
+            s=d['v'],
+            velocity=np.stack((d['vx'], d['vy']), axis=-1),
+            team=d['team_id'],
+            settings=self.settings,
+        )
+        node_features = compute_node_features_pl(
+            d['x'],
+            d['y'],
+            s=d['v'],
+            velocity=np.stack((d['vx'], d['vy']), axis=-1),
+            team=d['team_id'],
+            possession_team=d['ball_owning_team_id'],
+            is_gk=(d['position_name'] == self.settings.goalkeeper_id).astype(int),
+            settings=self.settings,
+        )
+        return {
+            "e": pl.Series(
+                [edge_features.tolist()], dtype=pl.List(pl.List(pl.Float64))
+            ),
+            "x": pl.Series(
+                [node_features.tolist()], dtype=pl.List(pl.List(pl.Float64))
+            ),
+            "a": pl.Series(
+                [adjacency_matrix.tolist()], dtype=pl.List(pl.List(pl.Int32))
+            ),
+            "e_shape_0": edge_features.shape[0],
+            "e_shape_1": edge_features.shape[1],
+            "x_shape_0": node_features.shape[0],
+            "x_shape_1": node_features.shape[1],
+            "a_shape_0": adjacency_matrix.shape[0],
+            "a_shape_1": adjacency_matrix.shape[1],
+            self.graph_id_col: d[self.graph_id_col][0],
+            self.label_col: d[self.label_col][0],
+        }
+    
+    def _convert(self):
         result_df = self.dataset.group_by(
             ["game_id", "frame_id"], maintain_order=True
         ).agg(
             pl.map_groups(
-                exprs=[
-                    "x",
-                    "y",
-                    "z",
-                    "v",
-                    "vx",
-                    "vy",
-                    "vz",
-                    "a",
-                    "ax",
-                    "ay",
-                    "az",
-                    "team_id",
-                    "position_name",
-                    "ball_owning_team_id",
-                    self.graph_id_col,
-                    self.label_col,
-                ],
-                function=__compute,
+                exprs=self.__exprs_variables,
+                function=self.__compute,
             ).alias("result_dict")
         )
 
@@ -318,6 +369,8 @@ def __compute(args: List[pl.Series]) -> dict:
         )
 
         return graph_df.drop("result_dict")
+    
+    
 
     def to_graph_frames(self) -> List[dict]:
         def __convert_to_graph_data_list(df):
@@ -353,10 +406,10 @@ def __convert_to_graph_data_list(df):
                 graph_list.extend(chunk_graph_list)
 
             return graph_list
-
+        
         graph_df = self._convert()
-        self.graph_frames = __convert_to_graph_data_list(graph_df)
-
+        self.graph_frames = __convert_to_graph_data_list(graph_df)  # nested local helper, not an attribute of self
+        
         return self.graph_frames
 
     def to_spektral_graphs(self) -> List[Graph]:
diff --git a/unravel/soccer/graphs/graph_settings_pl.py b/unravel/soccer/graphs/graph_settings_pl.py
index 0ef8dce..4e934a9 100644
--- a/unravel/soccer/graphs/graph_settings_pl.py
+++ b/unravel/soccer/graphs/graph_settings_pl.py
@@ -8,10 +8,9 @@
 
 
 @dataclass
-class GraphSettingsPL(DefaultGraphSettings):
+class GraphSettingsPolars(DefaultGraphSettings):
     ball_id: str = "ball"
     goalkeeper_id: str = "GK"
-    infer_goalkeepers: bool = True
     boundary_correction: float = None
     non_potential_receiver_node_value: float = 0.1
     ball_carrier_treshold: float = 25.0
diff --git a/unravel/utils/objects/default_graph_settings.py b/unravel/utils/objects/default_graph_settings.py
index fd67519..d77b5c1 100644
--- a/unravel/utils/objects/default_graph_settings.py
+++ b/unravel/utils/objects/default_graph_settings.py
@@ -18,9 +18,6 @@ class DefaultGraphSettings:
     Attributes:
         infer_ball_ownership (bool):
             Infers 'attacking_team' if no 'ball_owning_team' (Kloppy) or 'attacking_team' (List[Dict]) is provided, by finding player closest to ball using ball xyz.
-            Also infers ball_carrier within ball_carrier_threshold
-        infer_goalkeepers (bool): set True if no GK label is provider, set False for incomplete (broadcast tracking) data that might not have a GK in every frame
-        ball_carrier_threshold (float): The distance threshold to determine the ball carrier. Defaults to 25.0.
         max_player_speed (float): The maximum speed of a player in meters per second. Defaults to 12.0.
         max_ball_speed (float): The maximum speed of the ball in meters per second. Defaults to 28.0.
         boundary_correction (float): A correction factor for boundary calculations, used to correct out of bounds as a percentages (Used as 1+boundary_correction, ie 0.05). Defaults to None.

From 37ad16c25603bad8e17debff7c160d94e1eac604 Mon Sep 17 00:00:00 2001
From: "UnravelSports [JB]" <jors@unravelsports.com>
Date: Sun, 26 Jan 2025 08:40:45 +0100
Subject: [PATCH 03/10] polars implementation

---
 tests/test_kloppy_polars.py                   |  64 ++-
 unravel/soccer/graphs/dataset.py              | 513 +++++++++++-------
 .../graphs/features/adjacency_matrix_pl.py    |  10 +-
 .../soccer/graphs/features/node_features.py   |   2 +-
 .../graphs/features/node_features_pl.py       | 167 +-----
 unravel/soccer/graphs/graph_converter_pl.py   | 300 +++++-----
 unravel/soccer/graphs/graph_settings_pl.py    |   5 +-
 unravel/utils/features/utils.py               |   2 +-
 8 files changed, 528 insertions(+), 535 deletions(-)

diff --git a/tests/test_kloppy_polars.py b/tests/test_kloppy_polars.py
index 08e3e6a..4d70d2a 100644
--- a/tests/test_kloppy_polars.py
+++ b/tests/test_kloppy_polars.py
@@ -1,8 +1,5 @@
 from pathlib import Path
-from unravel.soccer import (
-    SoccerGraphConverterPolars, 
-    KloppyPolarsDataset
-)
+from unravel.soccer import SoccerGraphConverterPolars, KloppyPolarsDataset
 from unravel.utils import (
     dummy_labels,
     dummy_graph_ids,
@@ -38,24 +35,24 @@ def kloppy_dataset(self, match_data: str, structured_data: str) -> TrackingDatas
             include_empty_frames=False,
             limit=500,
         )
-        
+
     @pytest.fixture()
-    def kloppy_polars_dataset(self, kloppy_dataset: TrackingDataset) -> KloppyPolarsDataset:
+    def kloppy_polars_dataset(
+        self, kloppy_dataset: TrackingDataset
+    ) -> KloppyPolarsDataset:
         dataset = KloppyPolarsDataset(
             kloppy_dataset=kloppy_dataset,
             ball_carrier_threshold=25.0,
         )
         dataset.load()
-        dataset.add_dummy_labels(
-            by=["game_id", "frame_id"]
-        )
-        dataset.add_graph_ids(
-            by=["game_id", "frame_id"]
-        )
+        dataset.add_dummy_labels(by=["game_id", "frame_id"])
+        dataset.add_graph_ids(by=["game_id", "frame_id"])
         return dataset
-    
+
     @pytest.fixture()
-    def spc_padding(self, kloppy_polars_dataset: KloppyPolarsDataset) -> SoccerGraphConverterPolars:
+    def spc_padding(
+        self, kloppy_polars_dataset: KloppyPolarsDataset
+    ) -> SoccerGraphConverterPolars:
         return SoccerGraphConverterPolars(
             dataset=kloppy_polars_dataset,
             chunk_size=2_0000,
@@ -75,13 +72,15 @@ def spc_padding(self, kloppy_polars_dataset: KloppyPolarsDataset) -> SoccerGraph
         )
 
     @pytest.fixture()
-    def soccer_polars_converter(self, kloppy_polars_dataset: KloppyPolarsDataset) -> SoccerGraphConverterPolars:
-        # TODO: 
-        # check if 
+    def soccer_polars_converter(
+        self, kloppy_polars_dataset: KloppyPolarsDataset
+    ) -> SoccerGraphConverterPolars:
+        # TODO:
+        # check if
         # - random_seed
         # - padding needs to be per team_id otherwise stuff breaks
         # all work as expected and/or should be moved to the KloppyPolarsDataset
-        
+
         return SoccerGraphConverterPolars(
             dataset=kloppy_polars_dataset,
             chunk_size=2_0000,
@@ -123,9 +122,6 @@ def soccer_polars_converter(self, kloppy_polars_dataset: KloppyPolarsDataset) ->
     #         verbose=False,
     #     )
     def test_padding(self, spc_padding: SoccerGraphConverterPolars):
-        """
-        Test navigating (next/prev) through events
-        """
         spektral_graphs = spc_padding.to_spektral_graphs()
 
         assert 1 == 1
@@ -134,32 +130,36 @@ def test_padding(self, spc_padding: SoccerGraphConverterPolars):
         assert len(data) == 384
         assert isinstance(data[0], Graph)
 
-    def test_to_spektral_graph(self, soccer_polars_converter: SoccerGraphConverterPolars):
+    def test_to_spektral_graph(
+        self, soccer_polars_converter: SoccerGraphConverterPolars
+    ):
         """
         Test navigating (next/prev) through events
         """
         spektral_graphs = soccer_polars_converter.to_spektral_graphs()
 
         assert 1 == 1
-        
+
         data = spektral_graphs
+        assert data[0].id == "2417-1529"
         assert len(data) == 489
         assert isinstance(data[0], Graph)
 
         x = data[0].x
-        assert x.shape == (10, 12)
-        assert 0.31373436337428573 == pytest.approx(x[0, 0], abs=1e-5)
-        assert 0.06765375015355701 == pytest.approx(x[0, 4], abs=1e-5)
-        assert 0.47729475229688306 == pytest.approx(x[8, 2], abs=1e-5)
+        n_players = x.shape[0]
+        assert x.shape == (n_players, 15)
+        assert 0.4524340998288571 == pytest.approx(x[0, 0], abs=1e-5)
+        assert 0.9948105277764999 == pytest.approx(x[0, 4], abs=1e-5)
+        assert 0.2941671698429814 == pytest.approx(x[8, 2], abs=1e-5)
 
         e = data[0].e
-        assert e.shape == (60, 6)
+        assert e.shape == (129, 6)
         assert 0.0 == pytest.approx(e[0, 0], abs=1e-5)
         assert 0.5 == pytest.approx(e[0, 4], abs=1e-5)
-        assert 0.579979482018554 == pytest.approx(e[8, 2], abs=1e-5)
+        assert 0.7140882876637022 == pytest.approx(e[8, 2], abs=1e-5)
 
         a = data[0].a
-        assert a.shape == (10, 10)
+        assert a.shape == (n_players, n_players)
         assert 1.0 == pytest.approx(a[0, 0], abs=1e-5)
         assert 1.0 == pytest.approx(a[0, 4], abs=1e-5)
         assert 0.0 == pytest.approx(a[8, 2], abs=1e-5)
@@ -167,7 +167,7 @@ def test_to_spektral_graph(self, soccer_polars_converter: SoccerGraphConverterPo
         dataset = CustomSpektralDataset(graphs=spektral_graphs)
         N, F, S, n_out, n = dataset.dimensions()
         assert N == 20
-        assert F == 12
+        assert F == 15
         assert S == 6
         assert n_out == 1
         assert n == 489
@@ -213,5 +213,3 @@ def test_to_spektral_graph(self, soccer_polars_converter: SoccerGraphConverterPo
             dataset.split_test_train(
                 split_train=4, split_test=5, by_graph_id=True, random_seed=42
             )
-
-    
diff --git a/unravel/soccer/graphs/dataset.py b/unravel/soccer/graphs/dataset.py
index b9c4243..e9d66b5 100644
--- a/unravel/soccer/graphs/dataset.py
+++ b/unravel/soccer/graphs/dataset.py
@@ -20,6 +20,44 @@
 DEFAULT_BALL_SMOOTHING_PARAMS = {"window_length": 3, "polyorder": 1}
 
 
+class Constant:
+    BALL = "ball"
+
+
+class Column:
+    BALL_OWNING_TEAM_ID = "ball_owning_team_id"
+    BALL_OWNING_PLAYER_ID = "ball_owning_player_id"
+    IS_BALL_CARRIER = "is_ball_carrier"
+    PERIOD_ID = "period_id"
+    TIMESTAMP = "timestamp"
+    BALL_STATE = "ball_state"
+    FRAME_ID = "frame_id"
+    GAME_ID = "game_id"
+    TEAM_ID = "team_id"
+    OBJECT_ID = "id"
+    POSITION_NAME = "position_name"
+
+    X = "x"
+    Y = "y"
+    Z = "z"
+
+    V = "v"
+    VX = "vx"
+    VY = "vy"
+    VZ = "vz"
+
+    A = "a"
+    AX = "ax"
+    AY = "ay"
+    AZ = "az"
+
+
+class Group:
+    BY_FRAME = [Column.GAME_ID, Column.PERIOD_ID, Column.FRAME_ID]
+    BY_FRAME_TEAM = [Column.GAME_ID, Column.PERIOD_ID, Column.FRAME_ID, Column.TEAM_ID]
+    BY_OBJECT_PERIOD = [Column.OBJECT_ID, Column.PERIOD_ID]
+
+
 @dataclass
 class SoccerObject:
     id: Union[str, int]
@@ -30,17 +68,19 @@ class SoccerObject:
 @dataclass
 class KloppyPolarsDataset(DefaultDataset):
     kloppy_dataset: TrackingDataset
-    ball_carrier_threshold: float = None
-    _identifier_column: str = field(default="id", init=False)
+    ball_carrier_threshold: float = 25.0
     _graph_id_column: str = field(default="graph_id")
     _label_column: str = field(default="label")
-    _partition_by: List[str] = field(
-        default_factory=lambda: ["id", "period_id"], init=False
-    )
-    _infer_ball_owning_team_id: bool = field(default=False, init=False)
     _overwrite_orientation: bool = field(default=False, init=False)
     _infer_goalkeepers: bool = field(default=False, init=False)
-    
+
+    def __post_init__(self):  # dataclass hook: type-check the init fields
+        if not isinstance(self.kloppy_dataset, TrackingDataset):
+            raise Exception("'kloppy_dataset' should be of type TrackingDataset")
+
+        if not isinstance(self.ball_carrier_threshold, float):
+            raise Exception("'ball_carrier_threshold' should be of type float")
+
     def __transform_orientation(self):
         if not self.kloppy_dataset.metadata.flags & DatasetFlag.BALL_OWNING_TEAM:
             self._overwrite_orientation = True
@@ -48,7 +88,7 @@ def __transform_orientation(self):
             to_orientation = Orientation.STATIC_HOME_AWAY
         else:
             to_orientation = Orientation.BALL_OWNING_TEAM
-            
+
         self.kloppy_dataset = DatasetTransformer.transform_dataset(
             dataset=self.kloppy_dataset,
             to_orientation=to_orientation,
@@ -62,11 +102,14 @@ def __transform_orientation(self):
     def __get_objects(self):
         def __artificial_game_id() -> str:
             from uuid import uuid4
+
             return str(uuid4())
-        
+
         home_team, away_team = self.kloppy_dataset.metadata.teams
-        
-        if all(item is None for item in [p.starting_position for p in home_team.players]):
+
+        if all(
+            item is None for item in [p.starting_position for p in home_team.players]
+        ):
             self._infer_goalkeepers = True
             home_players = [
                 SoccerObject(p.player_id, p.team.team_id, None)
@@ -85,7 +128,7 @@ def __artificial_game_id() -> str:
                 SoccerObject(p.player_id, p.team.team_id, p.starting_position.code)
                 for p in away_team.players
             ]
-        ball_object = SoccerObject("ball", "ball", "ball")
+        ball_object = SoccerObject(Constant.BALL, Constant.BALL, Constant.BALL)
         game_id = self.kloppy_dataset.metadata.game_id
         if game_id is None:
             game_id = __artificial_game_id()
@@ -96,17 +139,17 @@ def __unpivot(self, object, coordinate):
 
         return self.data.unpivot(
             index=[
-                "period_id",
-                "timestamp",
-                "frame_id",
-                "ball_state",
-                "ball_owning_team_id",
+                Column.PERIOD_ID,
+                Column.TIMESTAMP,
+                Column.FRAME_ID,
+                Column.BALL_STATE,
+                Column.BALL_OWNING_TEAM_ID,
             ],  # Columns to keep
             on=[column],
             value_name=coordinate,
-            variable_name=self._identifier_column,
+            variable_name=Column.OBJECT_ID,
         ).with_columns(
-            pl.col(self._identifier_column).str.replace(
+            pl.col(Column.OBJECT_ID).str.replace(
                 f"_{coordinate}", ""
             )  # Remove the coordinate suffix
         )
@@ -129,9 +172,13 @@ def __apply_smoothing(self, df: pl.DataFrame, smoothing_params: dict):
                 "Missing parameter 'polyorder' in player_smoothing_params and/or ball_smoothing_params"
             )
 
-        smoothed = df.group_by(self._partition_by, maintain_order=True).agg(
+        vx_smooth = f"{Column.VX}_smoothed"
+        vy_smooth = f"{Column.VY}_smoothed"
+        vz_smooth = f"{Column.VZ}_smoothed"
+
+        smoothed = df.group_by(Group.BY_OBJECT_PERIOD, maintain_order=True).agg(
             [
-                pl.col("vx")
+                pl.col(Column.VX)
                 .map_elements(
                     lambda vx: savgol_filter(
                         vx,
@@ -140,8 +187,8 @@ def __apply_smoothing(self, df: pl.DataFrame, smoothing_params: dict):
                     ).tolist(),
                     return_dtype=pl.List(pl.Float64),
                 )
-                .alias("vx_smoothed"),
-                pl.col("vy")
+                .alias(vx_smooth),
+                pl.col(Column.VY)
                 .map_elements(
                     lambda vy: savgol_filter(
                         vy,
@@ -150,8 +197,8 @@ def __apply_smoothing(self, df: pl.DataFrame, smoothing_params: dict):
                     ).tolist(),
                     return_dtype=pl.List(pl.Float64),
                 )
-                .alias("vy_smoothed"),
-                pl.col("vz")
+                .alias(vy_smooth),
+                pl.col(Column.VZ)
                 .map_elements(
                     lambda vy: savgol_filter(
                         vy,
@@ -160,18 +207,16 @@ def __apply_smoothing(self, df: pl.DataFrame, smoothing_params: dict):
                     ).tolist(),
                     return_dtype=pl.List(pl.Float64),
                 )
-                .alias("vz_smoothed"),
+                .alias(vz_smooth),
             ]
         )
         # Explode the smoothed columns back to original shape
-        smoothed_exploded = smoothed.explode(
-            ["vx_smoothed", "vy_smoothed", "vz_smoothed"]
-        )
+        smoothed_exploded = smoothed.explode([vx_smooth, vy_smooth, vz_smooth])
         # Combine with the original DataFrame if needed
         return df.with_columns(
-            vx=smoothed_exploded["vx_smoothed"],
-            vy=smoothed_exploded["vy_smoothed"],
-            vz=smoothed_exploded["vz_smoothed"],
+            vx=smoothed_exploded[vx_smooth],
+            vy=smoothed_exploded[vy_smooth],
+            vz=smoothed_exploded[vz_smooth],
         )
 
     def __add_velocity(
@@ -181,60 +226,65 @@ def __add_velocity(
         ball_smoothing_params: dict,
     ):
         df = (
-            df.sort(["id", "period_id", "timestamp", "team_id"], nulls_last=True)
+            df.sort(
+                Group.BY_OBJECT_PERIOD + [Column.TIMESTAMP, Column.TEAM_ID],
+                nulls_last=True,
+            )
             .with_columns(
                 [
                     # Calculate differences within each group
-                    pl.col("x").diff().over(self._partition_by).alias("dx"),
-                    pl.col("y").diff().over(self._partition_by).alias("dy"),
-                    pl.col("z").diff().over(self._partition_by).alias("dz"),
-                    (pl.col("timestamp").dt.total_milliseconds() / 1_000)
+                    pl.col(Column.X).diff().over(Group.BY_OBJECT_PERIOD).alias("dx"),
+                    pl.col(Column.Y).diff().over(Group.BY_OBJECT_PERIOD).alias("dy"),
+                    pl.col(Column.Z).diff().over(Group.BY_OBJECT_PERIOD).alias("dz"),
+                    (pl.col(Column.TIMESTAMP).dt.total_milliseconds() / 1_000)
                     .diff()
-                    .over(self._partition_by)
+                    .over(Group.BY_OBJECT_PERIOD)
                     .alias("dt"),
                 ]
             )
             .with_columns(
                 [
                     # Compute velocity components
-                    (pl.col("dx") / pl.col("dt")).alias("vx"),
-                    (pl.col("dy") / pl.col("dt")).alias("vy"),
-                    (pl.col("dz") / pl.col("dt")).alias("vz"),
+                    (pl.col("dx") / pl.col("dt")).alias(Column.VX),
+                    (pl.col("dy") / pl.col("dt")).alias(Column.VY),
+                    (pl.col("dz") / pl.col("dt")).alias(Column.VZ),
                 ]
             )
             .with_columns(
                 [
                     # Fill null values in vx and vy
-                    pl.col("vx").fill_null(0).alias("vx"),
-                    pl.col("vy").fill_null(0).alias("vy"),
-                    pl.col("vz").fill_null(0).alias("vz"),
+                    pl.col(Column.VX).fill_null(0).alias(Column.VX),
+                    pl.col(Column.VY).fill_null(0).alias(Column.VY),
+                    pl.col(Column.VZ).fill_null(0).alias(Column.VZ),
                 ]
             )
         )
 
         if player_smoothing_params:
             player_df = self.__apply_smoothing(
-                df=df.filter(pl.col(self._identifier_column) != self._ball_object.id),
+                df=df.filter(pl.col(Column.OBJECT_ID) != self._ball_object.id),
                 smoothing_params=player_smoothing_params,
             )
         else:
-            player_df = df.filter(
-                pl.col(self._identifier_column) != self._ball_object.id
-            )
+            player_df = df.filter(pl.col(Column.OBJECT_ID) != self._ball_object.id)
 
         if ball_smoothing_params:
             ball_df = self.__apply_smoothing(
-                df.filter(pl.col(self._identifier_column) == self._ball_object.id),
+                df.filter(pl.col(Column.OBJECT_ID) == self._ball_object.id),
                 smoothing_params=ball_smoothing_params,
             )
         else:
-            ball_df = df.filter(pl.col(self._identifier_column) == self._ball_object.id)
+            ball_df = df.filter(pl.col(Column.OBJECT_ID) == self._ball_object.id)
         df = pl.concat([player_df, ball_df])
         df = df.with_columns(
             [
-                (pl.col("vx") ** 2 + pl.col("vy") ** 2 + pl.col("vz") ** 2)
+                (
+                    pl.col(Column.VX) ** 2
+                    + pl.col(Column.VY) ** 2
+                    + pl.col(Column.VZ) ** 2
+                )
                 .sqrt()
-                .alias("v")
+                .alias(Column.V)
             ]
         )
 
@@ -245,33 +295,37 @@ def __add_acceleration(self, df: pl.DataFrame):
             df.with_columns(
                 [
                     # Calculate differences in vx, vy, and dt for acceleration
-                    pl.col("vx").diff().over(self._partition_by).alias("dvx"),
-                    pl.col("vy").diff().over(self._partition_by).alias("dvy"),
-                    pl.col("vz").diff().over(self._partition_by).alias("dvz"),
+                    pl.col(Column.VX).diff().over(Group.BY_OBJECT_PERIOD).alias("dvx"),
+                    pl.col(Column.VY).diff().over(Group.BY_OBJECT_PERIOD).alias("dvy"),
+                    pl.col(Column.VZ).diff().over(Group.BY_OBJECT_PERIOD).alias("dvz"),
                 ]
             )
             .with_columns(
                 [
                     # Compute ax and ay
-                    (pl.col("dvx") / pl.col("dt")).alias("ax"),
-                    (pl.col("dvy") / pl.col("dt")).alias("ay"),
-                    (pl.col("dvz") / pl.col("dt")).alias("az"),
+                    (pl.col("dvx") / pl.col("dt")).alias(Column.AX),
+                    (pl.col("dvy") / pl.col("dt")).alias(Column.AY),
+                    (pl.col("dvz") / pl.col("dt")).alias(Column.AZ),
                 ]
             )
             .with_columns(
                 [
                     # Fill null values in vx and vy
-                    pl.col("ax").fill_null(0).alias("ax"),
-                    pl.col("ay").fill_null(0).alias("ay"),
-                    pl.col("az").fill_null(0).alias("az"),
+                    pl.col(Column.AX).fill_null(0).alias(Column.AX),
+                    pl.col(Column.AY).fill_null(0).alias(Column.AY),
+                    pl.col(Column.AZ).fill_null(0).alias(Column.AZ),
                 ]
             )
             .with_columns(
                 [
                     # Compute magnitude of acceleration a
-                    (pl.col("ax") ** 2 + pl.col("ay") ** 2 + pl.col("az") ** 2)
+                    (
+                        pl.col(Column.AX) ** 2
+                        + pl.col(Column.AY) ** 2
+                        + pl.col(Column.AZ) ** 2
+                    )
                     .sqrt()
-                    .alias("a")
+                    .alias(Column.A)
                 ]
             )
         )
@@ -289,17 +343,19 @@ def __melt(
 
         for object in [ball_object] + home_players + away_players:
             melted_object_dfs = []
-            for k, coordinate in enumerate(["x", "y", "z"]):
-                if object.id != "ball" and coordinate == "z":
+            for k, coordinate in enumerate([Column.X, Column.Y, Column.Z]):
+                if object.id != Constant.BALL and coordinate == Column.Z:
                     continue
                 if not any(object.id in column for column in columns):
                     continue
 
                 melted_df = self.__unpivot(object, coordinate)
-                
-                if object.id == "ball" and coordinate == "z":
+
+                if object.id == Constant.BALL and coordinate == Column.Z:
                     if melted_df[coordinate].is_null().all():
-                        melted_df = melted_df.with_columns([pl.lit(0.0).alias("z")])
+                        melted_df = melted_df.with_columns(
+                            [pl.lit(0.0).alias(Column.Z)]
+                        )
                 if k == 0:
                     melted_object_dfs.append(melted_df)
                 else:
@@ -307,146 +363,188 @@ def __melt(
 
             if melted_object_dfs:
                 object_df = pl.concat(melted_object_dfs, how="horizontal")
-                if "z" not in object_df.columns:
-                    object_df = object_df.with_columns([pl.lit(0.0).alias("z")])
+                if Column.Z not in object_df.columns:
+                    object_df = object_df.with_columns([pl.lit(0.0).alias(Column.Z)])
                 object_df = object_df.with_columns(
                     [
-                        pl.lit(object.team_id).cast(pl.Utf8).alias("team_id"),
-                        pl.lit(object.position_name).alias("position_name"),
+                        pl.lit(object.team_id).cast(pl.Utf8).alias(Column.TEAM_ID),
+                        pl.lit(object.position_name).alias(Column.POSITION_NAME),
                     ]
                 )
 
                 melted_dfs.append(object_df)
-        
+
         df = pl.concat(melted_dfs, how="vertical")
-        df = df.with_columns([pl.lit(game_id).alias("game_id")])
-        df = df.sort(by=["period_id", "timestamp", "team_id"], nulls_last=True)
-        return df
-    
-    def __get_inferred_ball_owning_team_id(self, df: pl.DataFrame):
-        non_ball_owning_team = (
-            df.filter(pl.col("ball_owning_team_id").is_null())
-        )
-        ball_owning_team = (
-            df.filter(~pl.col("ball_owning_team_id").is_null())
-        )
-        
-        ball = (
-            non_ball_owning_team.filter(pl.col('team_id') == "ball")
-        )
-        players = (
-            non_ball_owning_team.filter(pl.col('team_id') != "ball")
+        df = df.with_columns([pl.lit(game_id).alias(Column.GAME_ID)])
+        df = df.sort(
+            by=[Column.PERIOD_ID, Column.TIMESTAMP, Column.TEAM_ID], nulls_last=True
         )
+        return df
+
+    def __infer_ball_carrier(self, df: pl.DataFrame):
+        if Column.BALL_OWNING_PLAYER_ID not in df.columns:
+            df = df.with_columns(
+                pl.lit(None)  # null, so the is_null() checks below infer it
+                .cast(df.schema[Column.OBJECT_ID])
+                .alias(Column.BALL_OWNING_PLAYER_ID)
+            )
+
+        # handle the non ball owning frames
+        ball = df.filter(pl.col(Column.TEAM_ID) == Constant.BALL)
+        players = df.filter(pl.col(Column.TEAM_ID) != Constant.BALL)
+
+        # ball owning team is empty, so we can drop it. Goal is to replace it
         result = (
-            players.drop('ball_owning_team_id')
-            .join(
+            players.join(
                 ball.select(
-                    ['game_id', 'period_id', 'frame_id', 
-                    pl.col('x').alias('ball_x'),
-                    pl.col('y').alias('ball_y'), 
-                    pl.col('z').alias('ball_z')]
+                    Group.BY_FRAME
+                    + [
+                        pl.col(Column.X).alias("ball_x"),
+                        pl.col(Column.Y).alias("ball_y"),
+                        pl.col(Column.Z).alias("ball_z"),
+                    ]
                 ),
-                on=['game_id', 'period_id', 'frame_id'],
-                how='left'
+                on=Group.BY_FRAME,
+                how="left",
             )
-            .with_columns([
-                ((pl.col('x') - pl.col('ball_x'))**2 + 
-                (pl.col('y') - pl.col('ball_y'))**2 + 
-                (pl.col('z') - pl.col('ball_z'))**2
-                ).sqrt().alias('distance')
-            ])
-            .group_by(['game_id', 'period_id', 'frame_id'])
-            .agg([
-                pl.when(pl.col('distance').min() < self.ball_carrier_threshold)
-                .then(pl.col('team_id').filter(pl.col('distance') == pl.col('distance').min()).first())
-                .otherwise(None)
-                .alias('ball_owning_team_id'),
-                pl.all().sort_by('distance').first()
-            ])
-        )
-        non_ball_owning_team = (
-            non_ball_owning_team.drop('ball_owning_team_id')
-            .join(
-                result.select(['game_id', 'period_id', 'frame_id', 'ball_owning_team_id']),
-                on=['game_id', 'period_id', 'frame_id'],
-                how='left'
+            .with_columns(
+                [
+                    (
+                        (pl.col(Column.X) - pl.col("ball_x")) ** 2
+                        + (pl.col(Column.Y) - pl.col("ball_y")) ** 2
+                        + (pl.col(Column.Z) - pl.col("ball_z")) ** 2
+                    )
+                    .sqrt()
+                    .alias("ball_dist")
+                ]
             )
-            .filter(
-                ~pl.col("ball_owning_team_id").is_null()
+            .group_by(Group.BY_FRAME)
+            .agg(
+                [
+                    pl.when((pl.col(Column.BALL_OWNING_TEAM_ID).is_null()))
+                    .then(
+                        pl.col(Column.TEAM_ID)
+                        .filter(
+                            (pl.col("ball_dist") == pl.col("ball_dist").min())
+                            & (pl.col("ball_dist").min() < self.ball_carrier_threshold)
+                        )
+                        .first()
+                    )
+                    .otherwise(pl.col(Column.BALL_OWNING_TEAM_ID))
+                    .alias(Column.BALL_OWNING_TEAM_ID),
+                    pl.when((pl.col(Column.BALL_OWNING_PLAYER_ID).is_null()))
+                    .then(
+                        pl.col(Column.OBJECT_ID)
+                        .filter(
+                            (pl.col("ball_dist") == pl.col("ball_dist").min())
+                            & (pl.col("ball_dist").min() < self.ball_carrier_threshold)
+                        )
+                        .first()
+                    )
+                    .otherwise(pl.col(Column.BALL_OWNING_PLAYER_ID))
+                    .alias(Column.BALL_OWNING_PLAYER_ID),
+                ]
+            )
+            .with_columns(
+                [
+                    pl.col(Column.BALL_OWNING_PLAYER_ID)
+                    .list.first()
+                    .alias(Column.BALL_OWNING_PLAYER_ID),
+                    pl.col(Column.BALL_OWNING_TEAM_ID)
+                    .list.first()
+                    .alias(Column.BALL_OWNING_TEAM_ID),
+                ]
             )
-            .with_columns([
-                pl.col("ball_owning_team_id").cast(ball_owning_team.schema['team_id'])
-            ])
-            .select(ball_owning_team.columns)
-        )
-        ball_owning_team = (
-            ball_owning_team
-            .with_columns([
-                pl.col("ball_owning_team_id").cast(ball_owning_team.schema['team_id'])
-            ])
         )
-        
-        new_df = (
-            pl.concat([
-                ball_owning_team,
-                non_ball_owning_team
-            ], how="vertical")
-            .sort(['game_id', 'period_id', 'frame_id', 'team_id'])
+        df = (
+            df.drop([Column.BALL_OWNING_PLAYER_ID, Column.BALL_OWNING_TEAM_ID])
+            .join(result, how="left", on=Group.BY_FRAME)
+            .with_columns(
+                pl.when(
+                    pl.col(Column.OBJECT_ID) == pl.col(Column.BALL_OWNING_PLAYER_ID)
+                )
+                .then(True)
+                .otherwise(False)
+                .alias(Column.IS_BALL_CARRIER)
+            )
+            .drop(Column.BALL_OWNING_PLAYER_ID)
+            .drop_nulls(subset=Column.BALL_OWNING_TEAM_ID)
         )
-        return new_df
-    
-    def __get_inferred_goalkeepers(self, df: pl.DataFrame):
+        return df
+
+    def __infer_goalkeepers(self, df: pl.DataFrame):
         goal_x = self.pitch_dimensions.pitch_length / 2
         goal_y = 0
-        
-        df_with_distances = (
-            df.filter(pl.col('team_id') != "ball")
-            .with_columns([
-                ((pl.col('x') - (-goal_x))**2 + (pl.col('y') - goal_y)**2).sqrt().alias('dist_left'),
-                ((pl.col('x') - goal_x)**2 + (pl.col('y') - goal_y)**2).sqrt().alias('dist_right')
-            ])
+
+        df_with_distances = df.filter(
+            pl.col(Column.TEAM_ID) != Constant.BALL
+        ).with_columns(
+            [
+                ((pl.col(Column.X) - (-goal_x)) ** 2 + (pl.col(Column.Y) - goal_y) ** 2)
+                .sqrt()
+                .alias("dist_left"),
+                ((pl.col(Column.X) - goal_x) ** 2 + (pl.col(Column.Y) - goal_y) ** 2)
+                .sqrt()
+                .alias("dist_right"),
+            ]
         )
         result = (
-            df_with_distances
-            .with_columns([
-                pl.col('dist_left').min().over(['game_id', 'period_id', 'frame_id', 'team_id']).alias('min_dist_left'),
-                pl.col('dist_right').min().over(['game_id', 'period_id', 'frame_id', 'team_id']).alias('min_dist_right')
-            ])
-            .with_columns([
-                pl.when(pl.col('team_id') == pl.col('ball_owning_team_id'))
-                .then(
-                    pl.when(pl.col('dist_left') == pl.col('min_dist_left'))
-                    .then(pl.lit('GK'))
-                    .otherwise(None)
-                )
-                .otherwise(
-                    pl.when(pl.col('dist_right') == pl.col('min_dist_right'))
-                    .then(pl.lit('GK'))
-                    .otherwise(None)
-                )
-                .alias('position_name')
-            ])
-            .drop(['min_dist_left', 'min_dist_right', 'dist_left', 'dist_right'])
+            df_with_distances.with_columns(
+                [
+                    pl.col("dist_left")
+                    .min()
+                    .over(Group.BY_FRAME_TEAM)
+                    .alias("min_dist_left"),
+                    pl.col("dist_right")
+                    .min()
+                    .over(Group.BY_FRAME_TEAM)
+                    .alias("min_dist_right"),
+                ]
+            )
+            .with_columns(
+                [
+                    pl.when(
+                        pl.col(Column.TEAM_ID) == pl.col(Column.BALL_OWNING_TEAM_ID)
+                    )
+                    .then(
+                        pl.when(pl.col("dist_left") == pl.col("min_dist_left"))
+                        .then(pl.lit("GK"))
+                        .otherwise(None)
+                    )
+                    .otherwise(
+                        pl.when(pl.col("dist_right") == pl.col("min_dist_right"))
+                        .then(pl.lit("GK"))
+                        .otherwise(None)
+                    )
+                    .alias(Column.POSITION_NAME)
+                ]
+            )
+            .drop(["min_dist_left", "min_dist_right", "dist_left", "dist_right"])
         )
-        ball_rows = df.filter(pl.col('team_id') == "ball")
+        ball_rows = df.filter(pl.col(Column.TEAM_ID) == Constant.BALL)
         non_ball_rows = result
 
-        return (
-            pl.concat([ball_rows, non_ball_rows], how="vertical")
-            .sort(['game_id', 'period_id', 'frame_id', 'team_id'])
+        return pl.concat([ball_rows, non_ball_rows], how="vertical").sort(
+            Group.BY_FRAME_TEAM
         )
-        
-    def __fix_orientation_to_ball_owning(self, df: pl.DataFrame, home_team_id: Union[str, int]):
+
+    def __fix_orientation_to_ball_owning(
+        self, df: pl.DataFrame, home_team_id: Union[str, int]
+    ):
         # When _overwrite_orientation is True, it means the orientation is "STATIC_HOME_AWAY"
         # This means that when away is the attacking team we can flip all coordinates by -1.0
-        
-        flip_columns = ['x', 'y', 'vx', 'vy', 'ax', 'ay']
-        
-        return df.with_columns([
-            pl.when(pl.col('ball_owning_team_id').cast(str) != str(home_team_id))
-            .then(pl.col(flip_columns) * -1)
-            .otherwise(pl.col(flip_columns))
-        ])
+
+        flip_columns = [Column.X, Column.Y, Column.VX, Column.VY, Column.AX, Column.AY]
+
+        return df.with_columns(
+            [
+                pl.when(
+                    pl.col(Column.BALL_OWNING_TEAM_ID).cast(str) != str(home_team_id)
+                )
+                .then(pl.col(flip_columns) * -1)
+                .otherwise(pl.col(flip_columns))
+            ]
+        )
 
     def load(
         self,
@@ -454,8 +552,10 @@ def load(
         ball_smoothing_params: Union[dict, None] = DEFAULT_BALL_SMOOTHING_PARAMS,
     ):
         if self.kloppy_dataset.metadata.orientation == Orientation.NOT_SET:
-            raise ValueError("Data sources with an undefined orientation can not be used inside the 'unravelsports' package...")
-        
+            raise ValueError(
+                "Data sources with an undefined orientation can not be used inside the 'unravelsports' package..."
+            )
+
         self.kloppy_dataset = self.__transform_orientation()
         self.pitch_dimensions = self.kloppy_dataset.metadata.pitch_dimensions
 
@@ -466,40 +566,39 @@ def load(
         df = self.__melt(
             self._home_players, self._away_players, self._ball_object, self._game_id
         )
-        
+
         df = self.__add_velocity(df, player_smoothing_params, ball_smoothing_params)
         df = self.__add_acceleration(df)
         df = df.drop(["dx", "dy", "dz", "dt", "dvx", "dvy", "dvz"])
-        
-        df = df.filter(
-            ~(pl.col('x').is_null() & pl.col('y').is_null())
-        )
-        
-        if df['ball_owning_team_id'].is_null().all() and self.ball_carrier_threshold:
-                raise ValueError("This dataset requires us to infer the ball_owning_team_id, please specifiy a ball_carrier_threshold (float) to do so.")
-        
-        if self.ball_carrier_threshold is not None:
-            df = self.__get_inferred_ball_owning_team_id(df)
-            
+
+        df = df.filter(~(pl.col(Column.X).is_null() & pl.col(Column.Y).is_null()))
+
+        if (
+            df[Column.BALL_OWNING_TEAM_ID].is_null().all()
+            and self.ball_carrier_threshold is None
+        ):
+            raise ValueError(
+                f"This dataset requires us to infer the {Column.BALL_OWNING_TEAM_ID}, please specifiy a ball_carrier_threshold (float) to do so."
+            )
+
+        df = self.__infer_ball_carrier(df)
+
         if self._overwrite_orientation:
             home_team, _ = self.kloppy_dataset.metadata.teams
-            df = self.__fix_orientation_to_ball_owning(df, home_team_id=home_team.team_id)
-        
+            df = self.__fix_orientation_to_ball_owning(
+                df, home_team_id=home_team.team_id
+            )
+
         if self._infer_goalkeepers:
-            df = self.__get_inferred_goalkeepers(df)
-        
+            df = self.__infer_goalkeepers(df)
+
         self.data = df
         return self.data, self.pitch_dimensions
 
-    def add_dummy_labels(
-        self,
-        by: List[str] = ["game_id", "frame_id"]
-    ) -> pl.DataFrame:
+    def add_dummy_labels(self, by: List[str] = ["game_id", "frame_id"]) -> pl.DataFrame:
         self.data = add_dummy_label_column(self.data, by, self._label_column)
         return self.data
 
-    def add_graph_ids(
-        self, by: List[str] = ["game_id", "period_id"]
-    ) -> pl.DataFrame:
+    def add_graph_ids(self, by: List[str] = ["game_id", "period_id"]) -> pl.DataFrame:
         self.data = add_graph_id_column(self.data, by, self._graph_id_column)
         return self.data
diff --git a/unravel/soccer/graphs/features/adjacency_matrix_pl.py b/unravel/soccer/graphs/features/adjacency_matrix_pl.py
index 7a5b2d2..2e27ea2 100644
--- a/unravel/soccer/graphs/features/adjacency_matrix_pl.py
+++ b/unravel/soccer/graphs/features/adjacency_matrix_pl.py
@@ -3,19 +3,21 @@
 
 
 from ....utils import AdjacencyMatrixType, AdjacenyMatrixConnectType, distance_to_ball
+from ..dataset import Constant
 
 
-def compute_adjacency_matrix_pl(team, possession_team, settings, ball_carrier_idx):
+def compute_adjacency_matrix_pl(team, ball_owning_team, settings, ball_carrier_idx):
     adjacency_matrix_type = settings.adjacency_matrix_type
     adjacency_matrix_connect_type = settings.adjacency_matrix_connect_type
-    ball_id = settings.ball_id
+    ball_id = Constant.BALL
+
+    exclusion_ids = np.asarray([ball_id, *np.unique(ball_owning_team)])
 
-    exclusion_ids = np.asarray([ball_id, *np.unique(possession_team)])
     defensive_team = np.setdiff1d(team, exclusion_ids)[0]
     if adjacency_matrix_type == AdjacencyMatrixType.DENSE:
         adjacency_matrix = np.ones((team.shape[0], team.shape[0])).astype(np.int32)
     elif adjacency_matrix_type == AdjacencyMatrixType.DENSE_AP:
-        is_att = team == np.unique(possession_team)[0]
+        is_att = team == np.unique(ball_owning_team)[0]
         adjacency_matrix = np.outer(is_att, is_att).astype(int)
     elif adjacency_matrix_type == AdjacencyMatrixType.DENSE_DP:
         is_def = team == defensive_team
diff --git a/unravel/soccer/graphs/features/node_features.py b/unravel/soccer/graphs/features/node_features.py
index 7127404..dd532b0 100644
--- a/unravel/soccer/graphs/features/node_features.py
+++ b/unravel/soccer/graphs/features/node_features.py
@@ -54,7 +54,7 @@ def player_features(p, team, potential_receiver=None):
             ),
             (
                 0.0
-                if np.isnan(p.x1)
+                if np.isnan(p.y1)
                 else normalize_coords(p.y1, pitch_dimensions.y_dim.max)
             ),
             0.0 if np.isnan(p.x1) else unit_vector(p.velocity)[0],
diff --git a/unravel/soccer/graphs/features/node_features_pl.py b/unravel/soccer/graphs/features/node_features_pl.py
index c1132b3..804ebd4 100644
--- a/unravel/soccer/graphs/features/node_features_pl.py
+++ b/unravel/soccer/graphs/features/node_features_pl.py
@@ -28,6 +28,7 @@ def compute_node_features_pl(
     team,
     possession_team,
     is_gk,
+    ball_carrier,
     settings,
 ):
     ball_id = settings.ball_id
@@ -63,8 +64,8 @@ def compute_node_features_pl(
     uv_velocity = unit_vectors(velocity)
 
     angles = normalize_angles(np.arctan2(uv_velocity[:, 1], uv_velocity[:, 0]))
-    sin_normed = normalize_sincos(np.sin(angles))
-    cos_normed = normalize_sincos(np.cos(angles))
+    v_sin_normed = normalize_sincos(np.sin(angles))
+    v_cos_normed = normalize_sincos(np.cos(angles))
 
     dist_to_goal = np.linalg.norm(position - goal_mouth_position, axis=1)
     normed_dist_to_goal = normalize_distance(
@@ -75,6 +76,16 @@ def compute_node_features_pl(
         value=dist_to_ball, max_distance=max_dist_to_player
     )
 
+    vec_to_goal = goal_mouth_position - position
+    angle_to_goal = np.arctan2(vec_to_goal[:, 1], vec_to_goal[:, 0])
+    goal_sin_normed = normalize_sincos(np.sin(angle_to_goal))
+    goal_cos_normed = normalize_sincos(np.cos(angle_to_goal))
+
+    vec_to_ball = ball_position - position
+    angle_to_ball = np.arctan2(vec_to_ball[:, 1], vec_to_ball[:, 0])
+    ball_sin_normed = normalize_sincos(np.sin(angle_to_ball))
+    ball_cos_normed = normalize_sincos(np.cos(angle_to_ball))
+
     is_possession_team = np.where(
         team == possession_team, 1, settings.defending_team_node_value
     )
@@ -86,159 +97,21 @@ def compute_node_features_pl(
             (
                 x_normed,
                 y_normed,
-                uv_velocity[:, 0],
-                uv_velocity[:, 1],
                 s_normed,
-                sin_normed,
-                cos_normed,
+                v_sin_normed,
+                v_cos_normed,
                 normed_dist_to_goal,
                 normed_dist_to_ball,
                 is_possession_team,
                 is_gk,
                 is_ball,
+                goal_sin_normed,
+                goal_cos_normed,
+                ball_sin_normed,
+                ball_cos_normed,
+                ball_carrier,
             ),
             axis=-1,
         )
     )
-
     return X
-
-
-# def node_features(
-#     attacking_players,
-#     defending_players,
-#     ball,
-#     max_player_speed,
-#     max_ball_speed,
-#     ball_carrier_idx,
-#     pitch_dimensions,
-#     include_ball_node: bool = True,
-#     defending_team_node_value: float = 0.1,
-#     non_potential_receiver_node_value: float = 0.1,
-# ):
-#     """
-#     node features matrix is (n_nodes, n_node_features) (<=23, 17)
-#     each player (and optionally ball) is a node
-
-#     player_features n_node_features must be equal to ball_features n_node_features
-#     """
-
-#     goal_mouth_position = (
-#         pitch_dimensions.pitch_length,
-#         pitch_dimensions.pitch_width / 2,
-#     )
-#     max_dist_to_player = np.sqrt(
-#         pitch_dimensions.pitch_length**2 + pitch_dimensions.pitch_width**2
-#     )
-#     max_dist_to_goal = np.sqrt(
-#         pitch_dimensions.pitch_length**2 + pitch_dimensions.pitch_width**2
-#     )
-
-#     def player_features(p, team, potential_receiver=None):
-#         ball_angle = math.atan2(p.y1 - ball.y1, p.x1 - ball.x1)
-#         goal_angle = math.atan2(
-#             p.y1 - goal_mouth_position[0], p.x1 - goal_mouth_position[1]
-#         )
-
-#         player_node_features = [
-#             (
-#                 0.0
-#                 if np.isnan(p.x1)
-#                 else normalize_coords(p.x1, pitch_dimensions.x_dim.max)
-#             ),
-#             (
-#                 0.0
-#                 if np.isnan(p.x1)
-#                 else normalize_coords(p.y1, pitch_dimensions.y_dim.max)
-#             ),
-#             0.0 if np.isnan(p.x1) else unit_vector(p.velocity)[0],
-#             0.0 if np.isnan(p.x1) else unit_vector(p.velocity)[1],
-#             (
-#                 0.0
-#                 if np.isnan(p.x1)
-#                 else round(normalize_speed(p.speed, max_speed=max_player_speed), 3)
-#             ),
-#             (
-#                 0.0
-#                 if np.isnan(p.x1)
-#                 else normalize_angles(np.arctan2(p.velocity[1], p.velocity[0]))
-#             ),
-#             (
-#                 0.0
-#                 if np.isnan(p.x1)
-#                 else normalize_distance(
-#                     np.linalg.norm(p.position - goal_mouth_position),
-#                     max_distance=max_dist_to_goal,
-#                 )
-#             ),  # distance to the goal mouth
-#             0.0 if np.isnan(p.x1) else normalize_angles(goal_angle),
-#             (
-#                 0.0
-#                 if np.isnan(p.x1)
-#                 else normalize_distance(
-#                     np.linalg.norm(p.position - ball.position),
-#                     max_distance=max_dist_to_player,
-#                 )
-#             ),  # distance to the ball
-#             0.0 if np.isnan(p.x1) else normalize_angles(ball_angle),
-#             0.0 if np.isnan(p.x1) else team,
-#             # 1 if player is on same team but not in possession, 0.1 for all other players, 0.1 if the player is 'missing'
-#             (
-#                 0.0
-#                 if np.isnan(p.x1)
-#                 else 1.0 if potential_receiver else non_potential_receiver_node_value
-#             ),
-#         ]
-#         return player_node_features
-
-#     def ball_features(ball):
-#         goal_angle = math.atan2(
-#             ball.y1 - goal_mouth_position[1], ball.x1 - goal_mouth_position[0]
-#         )
-#         ball_node_features = [
-#             normalize_coords(ball.x1, pitch_dimensions.x_dim.max),
-#             normalize_coords(ball.y1, pitch_dimensions.y_dim.max),
-#             unit_vector(ball.velocity)[0],
-#             unit_vector(ball.velocity)[1],
-#             round(normalize_speed(ball.speed, max_speed=max_ball_speed), 3),
-#             normalize_angles(np.arctan2(ball.velocity[1], ball.velocity[0])),
-#             normalize_distance(
-#                 np.linalg.norm(ball.position - goal_mouth_position),
-#                 max_distance=max_dist_to_goal,
-#             ),  # distance to the goal mouth
-#             normalize_angles(goal_angle),
-#             # ball_angle 2x, ball_dist 2x, attacking_team 2x, ball carrier, potential receiver (all always 0 for ball)
-#             0,
-#             0,
-#             0,
-#             0,  # , 0
-#         ]
-
-#         return np.asarray([ball_node_features])
-
-#     # loop over attacking players, grab ball_carrier, potential receiver and intended receiver
-#     ap_features = np.asarray(
-#         [
-#             player_features(p, team=1, potential_receiver=(i != ball_carrier_idx))
-#             for i, p in enumerate(attacking_players)
-#         ]
-#     )
-
-#     # loop over defending playres, we don't have ball_carrier, or receivers
-#     dp_features = np.asarray(
-#         [
-#             player_features(p, team=defending_team_node_value)
-#             for i, p in enumerate(defending_players)
-#         ]
-#     )
-
-#     # compute ball features
-#     b_features = ball_features(ball)
-#     X = np.append(ap_features, dp_features, axis=0)
-
-#     if include_ball_node:
-#         X = np.append(X, b_features, axis=0)
-
-#     # convert np.NaN to 0 (zero)
-#     X = np.nan_to_num(X)
-#     return X
diff --git a/unravel/soccer/graphs/graph_converter_pl.py b/unravel/soccer/graphs/graph_converter_pl.py
index 252ff03..c1f835f 100644
--- a/unravel/soccer/graphs/graph_converter_pl.py
+++ b/unravel/soccer/graphs/graph_converter_pl.py
@@ -30,7 +30,7 @@
 )
 
 from .graph_settings_pl import GraphSettingsPolars
-from .dataset import KloppyPolarsDataset
+from .dataset import KloppyPolarsDataset, Column, Group, Constant
 from .features import (
     compute_node_features_pl,
     compute_adjacency_matrix_pl,
@@ -52,17 +52,7 @@ class SoccerGraphConverterPolars(DefaultGraphConverter):
 
     Attributes:
         dataset (TrackingDataset): Kloppy TrackingDataset.
-        labels (dict): Dict with a key per frame_id, like so {frame_id: True/False/1/0}
-        graph_id (str, int): Set a single id for the whole Kloppy dataset.
-        graph_ids (dict): Frame level control over graph ids.
-
-        The graph_ids will be used to assign each graph an identifier. This identifier allows us to split the CustomSpektralDataset such that
-            all graphs with the same id are either all in the test, train or validation set to avoid leakage. It is recommended to either set graph_id (int, str) as
-            a match_id, or pass a dictionary into 'graph_ids' with exactly the same keys as 'labels' for more granualar control over the graph ids.
-        The latter can be useful when splitting graphs by possession or sequence id. In this case the dict would be {frame_id: sequence_id/possession_id}.
-        Note that sequence_id/possession_id should probably be unique for the whole dataset. Perhaps like so {frame_id: 'match_id-sequence_id'}. Defaults to None.
-
-        ball_carrier_threshold (float): The distance threshold to determine the ball carrier. Defaults to 25.0.
+        chunk_size (int): Determines how many Graphs get processed simultaneously.
         non_potential_receiver_node_value (float): Value between 0 and 1 to assign to the defing team players
     """
 
@@ -75,145 +65,169 @@ def __post_init__(self):
         self.pitch_dimensions: MetricPitchDimensions = self.dataset.pitch_dimensions
         self.label_col = self.dataset._label_column
         self.graph_id_col = self.dataset._graph_id_column
-        
-        self.ball_carrier_threshold = self.dataset.ball_carrier_threshold
+
         self.dataset = self.dataset.data
 
         self._sport_specific_checks()
         self.settings = self._apply_settings()
         self.dataset = self._apply_filters()
-        
+
         if self.pad:
             self.dataset = self._apply_padding(df=self.dataset)
-    
-    @staticmethod   
-    def _apply_padding(df: pl.DataFrame) -> pl.DataFrame:
+
+    def _apply_padding(self, df: pl.DataFrame) -> pl.DataFrame:
         keep_columns = [
-            'timestamp',
-            'ball_state',
-            'position_name',
-            'label',
-            'graph_id'
+            Column.TIMESTAMP,
+            Column.BALL_STATE,
+            Column.POSITION_NAME,
+            self.label_col,
+            self.graph_id_col,
         ]
         empty_columns = [
-            'id', 'x', 'y', 'z', 'vx', 'vy',
-            'vz', 'v', 'ax', 'ay', 'az', 'a'
+            Column.OBJECT_ID,
+            Column.IS_BALL_CARRIER,
+            Column.X,
+            Column.Y,
+            Column.Z,
+            Column.VX,
+            Column.VY,
+            Column.VZ,
+            Column.V,
+            Column.AX,
+            Column.AY,
+            Column.AZ,
+            Column.A,
         ]
-        group_by_columns = ['game_id', 'period_id', 'frame_id', 'team_id', 'ball_owning_team_id']
-        
-        counts = (
-            df.group_by(group_by_columns)
-            .agg(
-                pl.len().alias('count'),
-                *[pl.first(col).alias(col) for col in keep_columns]
-            )
+        group_by_columns = [
+            Column.GAME_ID,
+            Column.PERIOD_ID,
+            Column.FRAME_ID,
+            Column.TEAM_ID,
+            Column.BALL_OWNING_TEAM_ID,
+        ]
+
+        counts = df.group_by(group_by_columns).agg(
+            pl.len().alias("count"), *[pl.first(col).alias(col) for col in keep_columns]
         )
-        
-        counts = counts.with_columns([
-            pl.when(pl.col('team_id') == "ball")
-            .then(1)
-            .when(pl.col('team_id') == pl.col('ball_owning_team_id'))
-            .then(11)
-            .otherwise(11)
-            .alias('target_length')
-        ])
-        
-        groups_to_pad = (
-            counts
-            .filter(pl.col('count') < pl.col('target_length'))
-            .with_columns(
-                (pl.col('target_length') - pl.col('count')).alias('repeats')
-            )
+
+        counts = counts.with_columns(
+            [
+                pl.when(pl.col(Column.TEAM_ID) == Constant.BALL)
+                .then(1)
+                .when(pl.col(Column.TEAM_ID) == pl.col(Column.BALL_OWNING_TEAM_ID))
+                .then(11)
+                .otherwise(11)
+                .alias("target_length")
+            ]
         )
-        
+
+        groups_to_pad = counts.filter(
+            pl.col("count") < pl.col("target_length")
+        ).with_columns((pl.col("target_length") - pl.col("count")).alias("repeats"))
+
         if len(groups_to_pad) == 0:
             return df
-            
+
         padding_rows = []
         for row in groups_to_pad.iter_rows(named=True):
             base_row = {col: row[col] for col in keep_columns + group_by_columns}
-            padding_rows.extend([base_row] * row['repeats'])
-        
+            padding_rows.extend([base_row] * row["repeats"])
+
         padding_df = pl.DataFrame(padding_rows)
-        
+
         schema = df.schema
-        padding_df = padding_df.with_columns([
-            pl.lit(0.0 if schema[col] != pl.String else "None").cast(schema[col]).alias(col)
-            for col in empty_columns
-        ])
-        
-        padding_df = padding_df.select(df.columns)
-        
-        result = pl.concat([df, padding_df], how='vertical')
-        
-        total_frames = (
-            result.select(['game_id', 'period_id', 'frame_id'])
-            .unique()
-            .height
+        padding_df = padding_df.with_columns(
+            [
+                pl.lit(0.0 if schema[col] != pl.String else "None")
+                .cast(schema[col])
+                .alias(col)
+                for col in empty_columns
+            ]
         )
-        
+
+        padding_df = padding_df.select(df.columns)
+
+        result = pl.concat([df, padding_df], how="vertical")
+
+        total_frames = result.select(Group.BY_FRAME).unique().height
+
         frame_completeness = (
-            result.group_by(['game_id', 'period_id', 'frame_id'])
-            .agg([
-                (pl.col('team_id').eq("ball").sum() == 1).alias('has_ball'),
-                (pl.col('team_id').eq(pl.col('ball_owning_team_id')).sum() == 11).alias('has_owning_team'),
-                ((~pl.col('team_id').eq("ball") & ~pl.col('team_id').eq(pl.col('ball_owning_team_id'))).sum() == 11).alias('has_other_team')
-            ])
+            result.group_by(Group.BY_FRAME)
+            .agg(
+                [
+                    (pl.col(Column.TEAM_ID).eq(Constant.BALL).sum() == 1).alias(
+                        "has_ball"
+                    ),
+                    (
+                        pl.col(Column.TEAM_ID)
+                        .eq(pl.col(Column.BALL_OWNING_TEAM_ID))
+                        .sum()
+                        == 11
+                    ).alias("has_owning_team"),
+                    (
+                        (
+                            ~pl.col(Column.TEAM_ID).eq(Constant.BALL)
+                            & ~pl.col(Column.TEAM_ID).eq(
+                                pl.col(Column.BALL_OWNING_TEAM_ID)
+                            )
+                        ).sum()
+                        == 11
+                    ).alias("has_other_team"),
+                ]
+            )
             .filter(
-                pl.col('has_ball') & pl.col('has_owning_team') & pl.col('has_other_team')
+                pl.col("has_ball")
+                & pl.col("has_owning_team")
+                & pl.col("has_other_team")
             )
         )
-        
+
         complete_frames = frame_completeness.height
-        
+
         dropped_frames = total_frames - complete_frames
         if dropped_frames > 0:
             import warnings
+
             warnings.warn(
                 f"""Setting pad=True drops frames that do not have at least 1 object for the attacking team, defending team or ball.
                 This operation dropped {dropped_frames} incomplete frames out of {total_frames} total frames ({(dropped_frames/total_frames)*100:.2f}%)
                 """
             )
-        
-        return result.join(
-            frame_completeness,
-            on=['game_id', 'period_id', 'frame_id'],
-            how='inner'
-        )
+
+        return result.join(frame_completeness, on=Group.BY_FRAME, how="inner")
 
     def _apply_filters(self):
         return self.dataset.with_columns(
             pl.when(
-                (pl.col(self.settings._identifier_column) == self.settings.ball_id)
-                & (pl.col("v") > self.settings.max_ball_speed)
+                (pl.col(Column.OBJECT_ID) == Constant.BALL)
+                & (pl.col(Column.V) > self.settings.max_ball_speed)
             )
             .then(self.settings.max_ball_speed)
             .when(
-                (pl.col(self.settings._identifier_column) != self.settings.ball_id)
-                & (pl.col("v") > self.settings.max_player_speed)
+                (pl.col(Column.OBJECT_ID) != Constant.BALL)
+                & (pl.col(Column.V) > self.settings.max_player_speed)
             )
             .then(self.settings.max_player_speed)
-            .otherwise(pl.col("v"))
-            .alias("v")
+            .otherwise(pl.col(Column.V))
+            .alias(Column.V)
         ).with_columns(
             pl.when(
-                (pl.col(self.settings._identifier_column) == self.settings.ball_id)
-                & (pl.col("a") > self.settings.max_ball_acceleration)
+                (pl.col(Column.OBJECT_ID) == Constant.BALL)
+                & (pl.col(Column.A) > self.settings.max_ball_acceleration)
             )
             .then(self.settings.max_ball_acceleration)
             .when(
-                (pl.col(self.settings._identifier_column) != self.settings.ball_id)
-                & (pl.col("a") > self.settings.max_player_acceleration)
+                (pl.col(Column.OBJECT_ID) != Constant.BALL)
+                & (pl.col(Column.A) > self.settings.max_player_acceleration)
             )
             .then(self.settings.max_player_acceleration)
-            .otherwise(pl.col("a"))
-            .alias("a")
+            .otherwise(pl.col(Column.A))
+            .alias(Column.A)
         )
 
     def _apply_settings(self):
         return GraphSettingsPolars(
             pitch_dimensions=self.pitch_dimensions,
-            ball_carrier_treshold=self.ball_carrier_threshold,
             max_player_speed=self.max_player_speed,
             max_ball_speed=self.max_ball_speed,
             max_player_acceleration=self.max_player_acceleration,
@@ -249,73 +263,83 @@ def _sport_specific_checks(self):
                 "Please specify a 'graph_id_col' and add that column to your 'dataset' ..."
             )
 
-        if self.ball_carrier_threshold and not isinstance(
-            self.ball_carrier_threshold, float
-        ):
-            raise Exception("'ball_carrier_threshold' should be of type float")
-
         if self.non_potential_receiver_node_value and not isinstance(
             self.non_potential_receiver_node_value, float
         ):
             raise Exception(
                 "'non_potential_receiver_node_value' should be of type float"
             )
-            
+
     @property
     def __exprs_variables(self):
         return [
-            "x", "y", "z",
-            "v", "vx", "vy", "vz",
-            "a", "ax", "ay", "az",
-            "team_id", "position_name", "ball_owning_team_id",
+            Column.X,
+            Column.Y,
+            Column.Z,
+            Column.V,
+            Column.VX,
+            Column.VY,
+            Column.VZ,
+            Column.A,
+            Column.AX,
+            Column.AY,
+            Column.AZ,
+            Column.TEAM_ID,
+            Column.POSITION_NAME,
+            Column.BALL_OWNING_TEAM_ID,
+            Column.IS_BALL_CARRIER,
             self.graph_id_col,
             self.label_col,
         ]
-    
+
     def __compute(self, args: List[pl.Series]) -> dict:
         d = {col: args[i].to_numpy() for i, col in enumerate(self.__exprs_variables)}
-        
+
         if not np.all(d[self.graph_id_col] == d[self.graph_id_col][0]):
             raise Exception(
                 "GraphId selection contains multiple different values. Make sure each graph_id is unique by at least game_id and frame_id..."
             )
 
-        if not self.prediction and not np.all(d[self.label_col] == d[self.label_col][0]):
+        if not self.prediction and not np.all(
+            d[self.label_col] == d[self.label_col][0]
+        ):
             raise Exception(
                 """Label selection contains multiple different values for a single selection (group by) of game_id and frame_id, 
                 make sure this is not the case. Each group can only have 1 label."""
             )
-        
-        ball_carrier_idx = get_ball_carrier_idx(
-            x=d['x'], y=d['y'], z=d['z'],
-            team=d['team_id'],
-            possession_team=d['ball_owning_team_id'],
-            ball_id=self.settings.ball_id,
-            threshold=self.settings.ball_carrier_treshold,
-        )
+        ball_carriers = np.where(d[Column.IS_BALL_CARRIER] == True)[0]
+        if len(ball_carriers) == 0:
+            ball_carrier_idx = None
+        else:
+            ball_carrier_idx = ball_carriers[0]
+
         adjacency_matrix = compute_adjacency_matrix_pl(
-            team=d['team_id'],
-            possession_team=d['ball_owning_team_id'],
+            team=d[Column.TEAM_ID],
+            ball_owning_team=d[Column.BALL_OWNING_TEAM_ID],
             settings=self.settings,
             ball_carrier_idx=ball_carrier_idx,
         )
+
+        velocity = np.stack((d[Column.VX], d[Column.VY]), axis=-1)
         edge_features = compute_edge_features_pl(
             adjacency_matrix=adjacency_matrix,
-            p3d=np.stack((d['x'], d['y'], d['z']), axis=-1),
-            p2d=np.stack((d['x'], d['y']), axis=-1),
-            s=d['v'],
-            velocity=np.stack((d['vx'], d['vy']), axis=-1),
-            team=d['team_id'],
+            p3d=np.stack((d[Column.X], d[Column.Y], d[Column.Z]), axis=-1),
+            p2d=np.stack((d[Column.X], d[Column.Y]), axis=-1),
+            s=d[Column.V],
+            velocity=velocity,
+            team=d[Column.TEAM_ID],
             settings=self.settings,
         )
+
         node_features = compute_node_features_pl(
-            d['x'],
-            d['y'],
-            s=d['v'],
-            velocity=np.stack((d['vx'], d['vy']), axis=-1),
-            team=d['team_id'],
-            possession_team=d['ball_owning_team_id'],
-            is_gk=(d['position_name'] == self.settings.goalkeeper_id).astype(int),
+            d[Column.X],
+            d[Column.Y],
+            s=d[Column.V],
+            velocity=velocity,
+            team=d[Column.TEAM_ID],
+            possession_team=d[Column.BALL_OWNING_TEAM_ID],
+            is_gk=(d[Column.POSITION_NAME] == self.settings.goalkeeper_id).astype(int),
+            ball_carrier=d[Column.IS_BALL_CARRIER],
             settings=self.settings,
         )
         return {
@@ -337,11 +361,9 @@ def __compute(self, args: List[pl.Series]) -> dict:
             self.graph_id_col: d[self.graph_id_col][0],
             self.label_col: d[self.label_col][0],
         }
-    
+
     def _convert(self):
-        result_df = self.dataset.group_by(
-            ["game_id", "frame_id"], maintain_order=True
-        ).agg(
+        result_df = self.dataset.group_by(Group.BY_FRAME, maintain_order=True).agg(
             pl.map_groups(
                 exprs=self.__exprs_variables,
                 function=self.__compute,
@@ -369,8 +391,6 @@ def _convert(self):
         )
 
         return graph_df.drop("result_dict")
-    
-    
 
     def to_graph_frames(self) -> List[dict]:
         def __convert_to_graph_data_list(df):
@@ -406,10 +426,10 @@ def __convert_to_graph_data_list(df):
                 graph_list.extend(chunk_graph_list)
 
             return graph_list
-        
+
         graph_df = self._convert()
-        self.graph_frames = self.__convert_to_graph_data_list(graph_df)
-        
+        self.graph_frames = __convert_to_graph_data_list(graph_df)
+
         return self.graph_frames
 
     def to_spektral_graphs(self) -> List[Graph]:
diff --git a/unravel/soccer/graphs/graph_settings_pl.py b/unravel/soccer/graphs/graph_settings_pl.py
index 4e934a9..a7713f4 100644
--- a/unravel/soccer/graphs/graph_settings_pl.py
+++ b/unravel/soccer/graphs/graph_settings_pl.py
@@ -6,10 +6,12 @@
 from kloppy.domain import Dimension, Unit, MetricPitchDimensions
 from typing import Optional
 
+from .dataset import Constant
+
 
 @dataclass
 class GraphSettingsPolars(DefaultGraphSettings):
-    ball_id: str = "ball"
+    ball_id: str = Constant.BALL
     goalkeeper_id: str = "GK"
     boundary_correction: float = None
     non_potential_receiver_node_value: float = 0.1
@@ -17,7 +19,6 @@ class GraphSettingsPolars(DefaultGraphSettings):
     pitch_dimensions: MetricPitchDimensions = field(
         init=False, repr=False, default_factory=MetricPitchDimensions
     )
-    _identifier_column: str = field(default="id", init=False)
 
     def __post_init__(self):
         self._sport_specific_checks()
diff --git a/unravel/utils/features/utils.py b/unravel/utils/features/utils.py
index 282fe61..c11e8f3 100644
--- a/unravel/utils/features/utils.py
+++ b/unravel/utils/features/utils.py
@@ -204,7 +204,7 @@ def distance_to_ball(
 
 def get_ball_carrier_idx(x, y, z, team, possession_team, ball_id, threshold):
     _, _, dist_to_ball = distance_to_ball(x=x, y=y, z=z, team=team, ball_id=ball_id)
-
+    print(dist_to_ball)
     filtered_distances = np.where(
         (team != possession_team) | (dist_to_ball <= threshold), np.inf, dist_to_ball
     )

From bd2a63687ba29e01c7df6bb1e91b415adddf244d Mon Sep 17 00:00:00 2001
From: "UnravelSports [JB]" <jors@unravelsports.com>
Date: Sun, 26 Jan 2025 12:13:32 +0100
Subject: [PATCH 04/10] Qhull error

---
 unravel/soccer/graphs/graph_converter.py    |  23 ++-
 unravel/soccer/graphs/graph_converter_pl.py | 204 ++++++++++++++------
 2 files changed, 155 insertions(+), 72 deletions(-)

diff --git a/unravel/soccer/graphs/graph_converter.py b/unravel/soccer/graphs/graph_converter.py
index 4eded6e..31c093b 100644
--- a/unravel/soccer/graphs/graph_converter.py
+++ b/unravel/soccer/graphs/graph_converter.py
@@ -2,6 +2,8 @@
 import sys
 from copy import deepcopy
 
+from scipy.spatial.qhull import QhullError
+
 import warnings
 
 from dataclasses import dataclass, field, asdict
@@ -238,15 +240,18 @@ def to_graph_frames(self) -> dict:
             for frame in tqdm(self.dataset, desc="Processing frames"):
                 data, label, frame_id, graph_id = self._convert(frame)
                 if data.home_players and data.away_players:
-                    gnn_frame = GraphFrame(
-                        frame_id=frame_id,
-                        data=data,
-                        label=label,
-                        graph_id=graph_id,
-                        settings=self.settings,
-                    )
-                    if gnn_frame.graph_data:
-                        self.graph_frames.append(gnn_frame)
+                    try:
+                        gnn_frame = GraphFrame(
+                            frame_id=frame_id,
+                            data=data,
+                            label=label,
+                            graph_id=graph_id,
+                            settings=self.settings,
+                        )
+                        if gnn_frame.graph_data:
+                            self.graph_frames.append(gnn_frame)
+                    except QhullError:
+                        pass  # best-effort: drop frames whose GraphFrame raises QhullError
 
         return self.graph_frames
 
diff --git a/unravel/soccer/graphs/graph_converter_pl.py b/unravel/soccer/graphs/graph_converter_pl.py
index c1f835f..077e296 100644
--- a/unravel/soccer/graphs/graph_converter_pl.py
+++ b/unravel/soccer/graphs/graph_converter_pl.py
@@ -73,9 +73,23 @@ def __post_init__(self):
         self.dataset = self._apply_filters()
 
         if self.pad:
-            self.dataset = self._apply_padding(df=self.dataset)
+            self.dataset = self._apply_padding()
+
+        self._shuffle()
+
+    def _shuffle(self):
+        """Shuffle rows: int seed is reproducible, True is unseeded, else no-op."""
+        seed = self.settings.random_seed
+        # bool subclasses int, so True/False must be ruled out before the int test
+        if isinstance(seed, int) and not isinstance(seed, bool):
+            self.dataset = self.dataset.sample(fraction=1.0, seed=seed)
+        elif seed is True:
+            self.dataset = self.dataset.sample(fraction=1.0)
+        # anything else (False, None, ...) keeps the current row order
+
+    def _apply_padding(self) -> pl.DataFrame:
+        df = self.dataset
 
-    def _apply_padding(self, df: pl.DataFrame) -> pl.DataFrame:
         keep_columns = [
             Column.TIMESTAMP,
             Column.BALL_STATE,
@@ -342,6 +356,7 @@ def __compute(self, args: List[pl.Series]) -> dict:
             ball_carrier=d[Column.IS_BALL_CARRIER],
             settings=self.settings,
         )
+
         return {
             "e": pl.Series(
                 [edge_features.tolist()], dtype=pl.List(pl.List(pl.Float64))
@@ -362,76 +377,139 @@ def __compute(self, args: List[pl.Series]) -> dict:
             self.label_col: d[self.label_col][0],
         }
 
+    # def _convert(self):
+    #     result_df = self.dataset.group_by(Group.BY_FRAME, maintain_order=True).agg(
+    #         pl.map_groups(
+    #             exprs=self.__exprs_variables,
+    #             function=self.__compute,
+    #         ).alias("result_dict")
+    #     )
+
+    #     graph_df = result_df.with_columns(
+    #         [
+    #             pl.col("result_dict").struct.field("a").alias("a"),
+    #             pl.col("result_dict").struct.field("e").alias("e"),
+    #             pl.col("result_dict").struct.field("x").alias("x"),
+    #             pl.col("result_dict").struct.field("e_shape_0").alias("e_shape_0"),
+    #             pl.col("result_dict").struct.field("e_shape_1").alias("e_shape_1"),
+    #             pl.col("result_dict").struct.field("x_shape_0").alias("x_shape_0"),
+    #             pl.col("result_dict").struct.field("x_shape_1").alias("x_shape_1"),
+    #             pl.col("result_dict").struct.field("a_shape_0").alias("a_shape_0"),
+    #             pl.col("result_dict").struct.field("a_shape_1").alias("a_shape_1"),
+    #             pl.col("result_dict")
+    #             .struct.field(self.graph_id_col)
+    #             .alias(self.graph_id_col),
+    #             pl.col("result_dict")
+    #             .struct.field(self.label_col)
+    #             .alias(self.label_col),
+    #         ]
+    #     )
+
+    #     return graph_df.drop("result_dict")
+
+    # def to_graph_frames(self) -> List[dict]:
+    #     def __convert_to_graph_data_list(df):
+    #         lazy_df = df.lazy()
+
+    #         graph_list = []
+
+    #         for chunk in lazy_df.collect().iter_slices(self.chunk_size):
+    #             chunk_graph_list = [
+    #                 {
+    #                     "a": make_sparse(
+    #                         flatten_to_reshaped_array(
+    #                             arr=chunk["a"][i],
+    #                             s0=chunk["a_shape_0"][i],
+    #                             s1=chunk["a_shape_1"][i],
+    #                         )
+    #                     ),
+    #                     "x": flatten_to_reshaped_array(
+    #                         arr=chunk["x"][i],
+    #                         s0=chunk["x_shape_0"][i],
+    #                         s1=chunk["x_shape_1"][i],
+    #                     ),
+    #                     "e": flatten_to_reshaped_array(
+    #                         arr=chunk["e"][i],
+    #                         s0=chunk["e_shape_0"][i],
+    #                         s1=chunk["e_shape_1"][i],
+    #                     ),
+    #                     "y": np.asarray([chunk[self.label_col][i]]),
+    #                     "id": chunk[self.graph_id_col][i],
+    #                 }
+    #                 for i in range(len(chunk["a"]))
+    #             ]
+    #             graph_list.extend(chunk_graph_list)
+
+    #         return graph_list
+
+    #     graph_df = self._convert()
+    #     self.graph_frames = __convert_to_graph_data_list(graph_df)
+
+    #     return self.graph_frames
+
+    ###
     def _convert(self):
-        result_df = self.dataset.group_by(Group.BY_FRAME, maintain_order=True).agg(
-            pl.map_groups(
-                exprs=self.__exprs_variables,
-                function=self.__compute,
-            ).alias("result_dict")
-        )
-
-        graph_df = result_df.with_columns(
-            [
-                pl.col("result_dict").struct.field("a").alias("a"),
-                pl.col("result_dict").struct.field("e").alias("e"),
-                pl.col("result_dict").struct.field("x").alias("x"),
-                pl.col("result_dict").struct.field("e_shape_0").alias("e_shape_0"),
-                pl.col("result_dict").struct.field("e_shape_1").alias("e_shape_1"),
-                pl.col("result_dict").struct.field("x_shape_0").alias("x_shape_0"),
-                pl.col("result_dict").struct.field("x_shape_1").alias("x_shape_1"),
-                pl.col("result_dict").struct.field("a_shape_0").alias("a_shape_0"),
-                pl.col("result_dict").struct.field("a_shape_1").alias("a_shape_1"),
-                pl.col("result_dict")
-                .struct.field(self.graph_id_col)
-                .alias(self.graph_id_col),
-                pl.col("result_dict")
-                .struct.field(self.label_col)
-                .alias(self.label_col),
-            ]
+        # Group and aggregate in one step
+        return (
+            self.dataset.group_by(Group.BY_FRAME, maintain_order=True)
+            .agg(
+                pl.map_groups(
+                    exprs=self.__exprs_variables, function=self.__compute
+                ).alias("result_dict")
+            )
+            .with_columns(
+                [
+                    *[
+                        pl.col("result_dict").struct.field(f).alias(f)
+                        for f in ["a", "e", "x", self.graph_id_col, self.label_col]
+                    ],
+                    *[
+                        pl.col("result_dict")
+                        .struct.field(f"{m}_shape_{i}")
+                        .alias(f"{m}_shape_{i}")
+                        for m in ["a", "e", "x"]
+                        for i in [0, 1]
+                    ],
+                ]
+            )
+            .drop("result_dict")
         )
 
-        return graph_df.drop("result_dict")
+    @staticmethod
+    def _reshape_array(arr, s0, s1):
+        return np.array([item for sublist in arr for item in sublist]).reshape(s0, s1)
 
     def to_graph_frames(self) -> List[dict]:
-        def __convert_to_graph_data_list(df):
-            lazy_df = df.lazy()
-
-            graph_list = []
-
-            for chunk in lazy_df.collect().iter_slices(self.chunk_size):
-                chunk_graph_list = [
-                    {
-                        "a": make_sparse(
-                            flatten_to_reshaped_array(
-                                arr=chunk["a"][i],
-                                s0=chunk["a_shape_0"][i],
-                                s1=chunk["a_shape_1"][i],
-                            )
-                        ),
-                        "x": flatten_to_reshaped_array(
-                            arr=chunk["x"][i],
-                            s0=chunk["x_shape_0"][i],
-                            s1=chunk["x_shape_1"][i],
-                        ),
-                        "e": flatten_to_reshaped_array(
-                            arr=chunk["e"][i],
-                            s0=chunk["e_shape_0"][i],
-                            s1=chunk["e_shape_1"][i],
-                        ),
-                        "y": np.asarray([chunk[self.label_col][i]]),
-                        "id": chunk[self.graph_id_col][i],
-                    }
-                    for i in range(len(chunk["a"]))
-                ]
-                graph_list.extend(chunk_graph_list)
-
-            return graph_list
+        def process_chunk(chunk: pl.DataFrame) -> List[dict]:
+            return [
+                {
+                    "a": make_sparse(
+                        self._reshape_array(
+                            chunk["a"][i], chunk["a_shape_0"][i], chunk["a_shape_1"][i]
+                        )
+                    ),
+                    "x": self._reshape_array(
+                        chunk["x"][i], chunk["x_shape_0"][i], chunk["x_shape_1"][i]
+                    ),
+                    "e": self._reshape_array(
+                        chunk["e"][i], chunk["e_shape_0"][i], chunk["e_shape_1"][i]
+                    ),
+                    "y": np.asarray([chunk[self.label_col][i]]),
+                    "id": chunk[self.graph_id_col][i],
+                }
+                for i in range(len(chunk))
+            ]
 
         graph_df = self._convert()
-        self.graph_frames = __convert_to_graph_data_list(graph_df)
-
+        self.graph_frames = [
+            graph
+            for chunk in graph_df.lazy().collect().iter_slices(self.chunk_size)
+            for graph in process_chunk(chunk)
+        ]
         return self.graph_frames
 
+    ###
+
     def to_spektral_graphs(self) -> List[Graph]:
         if not self.graph_frames:
             self.to_graph_frames()

From 310816d9fd3e050bb1765db805e5c24ce4140101 Mon Sep 17 00:00:00 2001
From: "UnravelSports [JB]" <jors@unravelsports.com>
Date: Sun, 26 Jan 2025 13:25:27 +0100
Subject: [PATCH 05/10] reworked NFL

---
 tests/test_bigdb.py                           |  17 +-
 tests/test_kloppy_polars.py                   |  27 --
 tests/test_spektral.py                        |   8 +-
 unravel/american_football/graphs/dataset.py   | 152 ++++++----
 .../graphs/features/adjacency_matrix.py       |   3 +-
 .../graphs/features/edge_features.py          |   3 +
 .../graphs/features/node_features.py          |  12 +-
 .../graphs/graph_converter.py                 | 287 ++++++++----------
 .../graphs/graph_settings.py                  |   2 -
 unravel/soccer/graphs/dataset.py              |  28 +-
 .../graphs/features/edge_features_pl.py       |   3 +
 .../graphs/features/node_features_pl.py       |   5 +-
 unravel/soccer/graphs/graph_converter_pl.py   | 132 ++------
 unravel/utils/features/utils.py               |  16 +-
 unravel/utils/objects/default_dataset.py      |   5 +-
 .../utils/objects/default_graph_converter.py  |   3 +
 16 files changed, 309 insertions(+), 394 deletions(-)

diff --git a/tests/test_bigdb.py b/tests/test_bigdb.py
index adedd3e..eb2caca 100644
--- a/tests/test_bigdb.py
+++ b/tests/test_bigdb.py
@@ -20,9 +20,8 @@
     AmericanFootballGraphConverter,
     AmericanFootballPitchDimensions,
 )
+from unravel.american_football.graphs.dataset import Constant
 from unravel.utils import (
-    add_graph_id_column,
-    add_dummy_label_column,
     flatten_to_reshaped_array,
     make_sparse,
     CustomSpektralDataset,
@@ -53,10 +52,8 @@ def dataset(self, coordinates: str, players: str, plays: str):
             plays_file_path=plays,
         )
         bdb_dataset.load()
-        bdb_dataset.add_graph_ids(by=["gameId", "playId"], column_name="graph_id")
-        bdb_dataset.add_dummy_labels(
-            by=["gameId", "playId", "frameId"], column_name="label"
-        )
+        bdb_dataset.add_graph_ids(by=["gameId", "playId"])
+        bdb_dataset.add_dummy_labels(by=["gameId", "playId", "frameId"])
         return bdb_dataset
 
     @pytest.fixture
@@ -141,8 +138,6 @@ def node_feature_values(self):
     @pytest.fixture
     def arguments(self):
         return dict(
-            label_col="label",
-            graph_id_col="graph_id",
             max_player_speed=8.0,
             max_ball_speed=28.0,
             max_player_acceleration=10.0,
@@ -161,8 +156,6 @@ def arguments(self):
     @pytest.fixture
     def non_default_arguments(self):
         return dict(
-            label_col="label",
-            graph_id_col="graph_id",
             max_player_speed=12.0,
             max_ball_speed=24.0,
             max_player_acceleration=11.0,
@@ -199,8 +192,8 @@ def test_settings(self, gnnc_non_default, non_default_arguments):
         assert settings.pitch_dimensions.y_dim.min == -26.65
         assert settings.pitch_dimensions.end_zone == 50.0
 
-        assert settings.ball_id == "football"
-        assert settings.qb_id == "QB"
+        assert Constant.BALL == "football"
+        assert Constant.QB == "QB"
         assert settings.max_height == 225.0
         assert settings.min_height == 150.0
         assert settings.max_weight == 200.0
diff --git a/tests/test_kloppy_polars.py b/tests/test_kloppy_polars.py
index 4d70d2a..df31b1a 100644
--- a/tests/test_kloppy_polars.py
+++ b/tests/test_kloppy_polars.py
@@ -75,11 +75,6 @@ def spc_padding(
     def soccer_polars_converter(
         self, kloppy_polars_dataset: KloppyPolarsDataset
     ) -> SoccerGraphConverterPolars:
-        # TODO:
-        # check if
-        # - random_seed
-        # - padding needs to be per team_id otherwise stuff breaks
-        # all work as expected and/or should be moved to the KloppyPolarsDataset
 
         return SoccerGraphConverterPolars(
             dataset=kloppy_polars_dataset,
@@ -99,28 +94,6 @@ def soccer_polars_converter(
             verbose=False,
         )
 
-    # @pytest.fixture()
-    # def gnnc_padding_random(self, dataset: TrackingDataset) -> SoccerGraphConverter:
-    #     return SoccerGraphConverter(
-    #         dataset=dataset,
-    #         labels=dummy_labels(dataset),
-    #         # settings
-    #         ball_carrier_treshold=25.0,
-    #         max_player_speed=12.0,
-    #         max_ball_speed=28.0,
-    #         boundary_correction=None,
-    #         self_loop_ball=False,
-    #         adjacency_matrix_connect_type="ball",
-    #         adjacency_matrix_type="split_by_team",
-    #         label_type="binary",
-    #         defending_team_node_value=0.0,
-    #         non_potential_receiver_node_value=0.1,
-    #         infer_ball_ownership=True,
-    #         infer_goalkeepers=True,
-    #         random_seed=42,
-    #         pad=True,
-    #         verbose=False,
-    #     )
     def test_padding(self, spc_padding: SoccerGraphConverterPolars):
         spektral_graphs = spc_padding.to_spektral_graphs()
 
diff --git a/tests/test_spektral.py b/tests/test_spektral.py
index b170970..6e14ae4 100644
--- a/tests/test_spektral.py
+++ b/tests/test_spektral.py
@@ -45,10 +45,8 @@ def bdb_dataset(self, coordinates: str, players: str, plays: str):
             plays_file_path=plays,
         )
         bdb_dataset.load()
-        bdb_dataset.add_graph_ids(by=["gameId", "playId"], column_name="graph_id")
-        bdb_dataset.add_dummy_labels(
-            by=["gameId", "playId", "frameId"], column_name="label"
-        )
+        bdb_dataset.add_graph_ids(by=["gameId", "playId"])
+        bdb_dataset.add_dummy_labels(by=["gameId", "playId", "frameId"])
         return bdb_dataset
 
     @pytest.fixture
@@ -122,8 +120,6 @@ def bdb_converter(
     ) -> AmericanFootballGraphConverter:
         return AmericanFootballGraphConverter(
             dataset=bdb_dataset,
-            label_col="label",
-            graph_id_col="graph_id",
             max_player_speed=8.0,
             max_ball_speed=28.0,
             max_player_acceleration=10.0,
diff --git a/unravel/american_football/graphs/dataset.py b/unravel/american_football/graphs/dataset.py
index fdb7310..4b8ccff 100644
--- a/unravel/american_football/graphs/dataset.py
+++ b/unravel/american_football/graphs/dataset.py
@@ -10,22 +10,52 @@
 from ...utils import DefaultDataset, add_dummy_label_column, add_graph_id_column
 
 
+class Constant:
+    BALL = "football"
+    QB = "QB"
+
+
+class Column:
+    OBJECT_ID = "nflId"
+
+    GAME_ID = "gameId"
+    FRAME_ID = "frameId"
+    PLAY_ID = "playId"
+
+    X = "x"
+    Y = "y"
+
+    ACCELERATION = "a"
+    SPEED = "s"
+    ORIENTATION = "o"
+    DIRECTION = "dir"
+    TEAM = "team"
+    CLUB = "club"
+    OFFICIAL_POSITION = "officialPosition"
+    POSSESSION_TEAM = "possessionTeam"
+    HEIGHT_CM = "height_cm"
+    WEIGHT_KG = "weight_kg"
+
+
+class Group:
+    BY_FRAME = [Column.GAME_ID, Column.PLAY_ID, Column.FRAME_ID]
+    BY_PLAY_POSSESSION_TEAM = [Column.GAME_ID, Column.PLAY_ID, Column.POSSESSION_TEAM]
+
+
 @dataclass
 class BigDataBowlDataset(DefaultDataset):
-    tracking_file_path: str
-    players_file_path: str
-    plays_file_path: str
-    pitch_dimensions: AmericanFootballPitchDimensions = field(
-        init=False, repr=False, default_factory=AmericanFootballPitchDimensions
-    )
-
-    def __post_init__(self):
-        if (
-            not self.tracking_file_path
-            or not self.players_file_path
-            or not self.plays_file_path
-        ):
-            raise Exception("Missing data file path...")
+    def __init__(
+        self,
+        tracking_file_path: str,
+        players_file_path: str,
+        plays_file_path: str,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.tracking_file_path = tracking_file_path
+        self.players_file_path = players_file_path
+        self.plays_file_path = plays_file_path
+        self.pitch_dimensions = AmericanFootballPitchDimensions()
 
     def load(self):
         pitch_length = self.pitch_dimensions.pitch_length
@@ -42,48 +72,51 @@ def load(self):
         play_direction = "left"
 
         if "club" in df.columns:
-            df = df.with_columns(pl.col("club").alias("team"))
-            df = df.drop("club")
+            df = df.with_columns(pl.col(Column.CLUB).alias(Column.TEAM))
+            df = df.drop(Column.CLUB)
 
         df = (
             df.with_columns(
                 pl.when(pl.col("playDirection") == play_direction)
-                .then(pl.col("o") + 180)  # rotate 180 degrees
-                .otherwise(pl.col("o"))
-                .alias("o"),
+                .then(pl.col(Column.ORIENTATION) + 180)  # rotate 180 degrees
+                .otherwise(pl.col(Column.ORIENTATION))
+                .alias(Column.ORIENTATION),
                 pl.when(pl.col("playDirection") == play_direction)
-                .then(pl.col("dir") + 180)  # rotate 180 degrees
-                .otherwise(pl.col("dir"))
-                .alias("dir"),
+                .then(pl.col(Column.DIRECTION) + 180)  # rotate 180 degrees
+                .otherwise(pl.col(Column.DIRECTION))
+                .alias(Column.DIRECTION),
             )
             .with_columns(
                 [
-                    (pl.col("x") - (pitch_length / 2)).alias("x"),
-                    (pl.col("y") - (pitch_width / 2)).alias("y"),
+                    (pl.col(Column.X) - (pitch_length / 2)).alias(Column.X),
+                    (pl.col(Column.Y) - (pitch_width / 2)).alias(Column.Y),
                     # convert to radian on (-pi, pi) range
-                    (((pl.col("o") * np.pi / 180) + np.pi) % (2 * np.pi) - np.pi).alias(
-                        "o"
-                    ),
                     (
-                        ((pl.col("dir") * np.pi / 180) + np.pi) % (2 * np.pi) - np.pi
-                    ).alias("dir"),
+                        ((pl.col(Column.ORIENTATION) * np.pi / 180) + np.pi)
+                        % (2 * np.pi)
+                        - np.pi
+                    ).alias(Column.ORIENTATION),
+                    (
+                        ((pl.col(Column.DIRECTION) * np.pi / 180) + np.pi) % (2 * np.pi)
+                        - np.pi
+                    ).alias(Column.DIRECTION),
                 ]
             )
             .with_columns(
                 [
                     pl.when(pl.col("playDirection") == play_direction)
-                    .then(pl.col("x") * -1.0)
-                    .otherwise(pl.col("x"))
-                    .alias("x"),
+                    .then(pl.col(Column.X) * -1.0)
+                    .otherwise(pl.col(Column.X))
+                    .alias(Column.X),
                     pl.when(pl.col("playDirection") == play_direction)
-                    .then(pl.col("y") * -1.0)
-                    .otherwise(pl.col("y"))
-                    .alias("y"),
+                    .then(pl.col(Column.Y) * -1.0)
+                    .otherwise(pl.col(Column.Y))
+                    .alias(Column.Y),
                     # set "football" to nflId -9999 for ordering purposes
-                    pl.when(pl.col("team") == "football")
+                    pl.when(pl.col(Column.TEAM) == Constant.BALL)
                     .then(-9999.9)
-                    .otherwise(pl.col("nflId"))
-                    .alias("nflId"),
+                    .otherwise(pl.col(Column.OBJECT_ID))
+                    .alias(Column.OBJECT_ID),
                 ]
             )
         )
@@ -96,11 +129,15 @@ def load(self):
             ignore_errors=True,
         )
         if "position" in players.columns:
-            players = players.with_columns(pl.col("position").alias("officialPosition"))
+            players = players.with_columns(
+                pl.col("position").alias(Column.OFFICIAL_POSITION)
+            )
             players = players.drop("position")
 
         players = players.with_columns(
-            pl.col("nflId").cast(pl.Float64, strict=False).alias("nflId")
+            pl.col(Column.OBJECT_ID)
+            .cast(pl.Float64, strict=False)
+            .alias(Column.OBJECT_ID)
         )
         players = self._convert_weight_height_to_metric(df=players)
 
@@ -113,13 +150,22 @@ def load(self):
         )
 
         df = df.join(
-            (players.select(["nflId", "officialPosition", "height_cm", "weight_kg"])),
-            on="nflId",
+            (
+                players.select(
+                    [
+                        Column.OBJECT_ID,
+                        Column.OFFICIAL_POSITION,
+                        Column.HEIGHT_CM,
+                        Column.WEIGHT_KG,
+                    ]
+                )
+            ),
+            on=Column.OBJECT_ID,
             how="left",
         )
         df = df.join(
-            (plays.select(["gameId", "playId", "possessionTeam"])),
-            on=["gameId", "playId"],
+            (plays.select(Group.BY_PLAY_POSSESSION_TEAM)),
+            on=[Column.GAME_ID, Column.PLAY_ID],
             how="left",
         )
         self.data = df
@@ -137,17 +183,13 @@ def load(self):
         return self.data, self.pitch_dimensions
 
     def add_dummy_labels(
-        self,
-        by: List[str] = ["gameId", "playId", "frameId"],
-        column_name: str = "label",
+        self, by: List[str] = ["gameId", "playId", "frameId"]
     ) -> pl.DataFrame:
-        self.data = add_dummy_label_column(self.data, by, column_name)
+        self.data = add_dummy_label_column(self.data, by, self._label_column)
         return self.data
 
-    def add_graph_ids(
-        self, by: List[str] = ["gameId", "playId"], column_name: str = "graph_id"
-    ) -> pl.DataFrame:
-        self.data = add_graph_id_column(self.data, by, column_name)
+    def add_graph_ids(self, by: List[str] = ["gameId", "playId"]) -> pl.DataFrame:
+        self.data = add_graph_id_column(self.data, by, self._graph_id_column)
         return self.data
 
     @staticmethod
@@ -166,9 +208,11 @@ def _convert_weight_height_to_metric(df: pl.DataFrame):
         )
         df = df.with_columns(
             [
-                (pl.col("feet") * 30.48 + pl.col("inches") * 2.54).alias("height_cm"),
+                (pl.col("feet") * 30.48 + pl.col("inches") * 2.54).alias(
+                    Column.HEIGHT_CM
+                ),
                 (pl.col("weight") * 0.453592).alias(
-                    "weight_kg"
+                    Column.WEIGHT_KG
                 ),  # Convert pounds to kilograms
             ]
         ).drop(["height", "feet", "inches", "weight"])
diff --git a/unravel/american_football/graphs/features/adjacency_matrix.py b/unravel/american_football/graphs/features/adjacency_matrix.py
index 130cd0f..4272871 100644
--- a/unravel/american_football/graphs/features/adjacency_matrix.py
+++ b/unravel/american_football/graphs/features/adjacency_matrix.py
@@ -1,12 +1,13 @@
 import numpy as np
 
 from ....utils import AdjacencyMatrixType, AdjacenyMatrixConnectType
+from ..dataset import Constant
 
 
 def compute_adjacency_matrix(team, possession_team, settings):
     adjacency_matrix_type = settings.adjacency_matrix_type
     adjacency_matrix_connect_type = settings.adjacency_matrix_connect_type
-    ball_id = settings.ball_id
+    ball_id = Constant.BALL
 
     exclusion_ids = np.asarray([ball_id, *np.unique(possession_team)])
     defensive_team = np.setdiff1d(team, exclusion_ids)[0]
diff --git a/unravel/american_football/graphs/features/edge_features.py b/unravel/american_football/graphs/features/edge_features.py
index 7ff3081..78f491c 100644
--- a/unravel/american_football/graphs/features/edge_features.py
+++ b/unravel/american_football/graphs/features/edge_features.py
@@ -8,6 +8,7 @@
     normalize_speed_differences_nfl,
     normalize_accelerations_nfl,
 )
+from ..dataset import Constant
 
 
 def compute_edge_features(adjacency_matrix, p, s, a, o, dir, team, settings):
@@ -26,12 +27,14 @@ def compute_edge_features(adjacency_matrix, p, s, a, o, dir, team, settings):
     speed_diff_matrix_normed = normalize_speed_differences_nfl(
         s=speed_diff_matrix,
         team=team,
+        ball_id=Constant.BALL,
         settings=settings,
     )
     acc_diff_matrix = np.nan_to_num(a[None, :] - a[:, None])  # NxNx1
     acc_diff_matrix_normed = normalize_accelerations_nfl(
         a=acc_diff_matrix,
         team=team,
+        ball_id=Constant.BALL,
         settings=settings,
     )
     vect_to_player_matrix = p[:, None, :] - p[None, :, :]  # NxNx2
diff --git a/unravel/american_football/graphs/features/node_features.py b/unravel/american_football/graphs/features/node_features.py
index c723e21..dbf74f2 100644
--- a/unravel/american_football/graphs/features/node_features.py
+++ b/unravel/american_football/graphs/features/node_features.py
@@ -12,6 +12,8 @@
     normalize_between,
 )
 
+from ..dataset import Constant
+
 
 def compute_node_features(
     x,
@@ -27,7 +29,7 @@ def compute_node_features(
     weight,
     settings,
 ):
-    ball_id = settings.ball_id
+    ball_id = Constant.BALL
 
     goal_mouth_position = (
         settings.pitch_dimensions.x_dim.max,
@@ -61,10 +63,12 @@ def compute_node_features(
         min_value=settings.pitch_dimensions.y_dim.min,
     )
     uv_sa = unit_vector_from_angle(value=s, angle_radians=dir)
-    s_normed = normalize_speeds_nfl(s, team, settings)
+    s_normed = normalize_speeds_nfl(s, team, ball_id=Constant.BALL, settings=settings)
 
     uv_aa = unit_vector_from_angle(value=a, angle_radians=dir)
-    a_normed = normalize_accelerations_nfl(a, team, settings)
+    a_normed = normalize_accelerations_nfl(
+        a, team, ball_id=Constant.BALL, settings=settings
+    )
 
     dir_sin_normed = normalize_sincos(np.nan_to_num(np.sin(dir)))
     dir_cos_normed = normalize_sincos(np.nan_to_num(np.cos(dir)))
@@ -92,7 +96,7 @@ def compute_node_features(
         team == possession_team, 1, settings.defending_team_node_value
     )
     is_qb = np.where(
-        official_position == settings.qb_id,  # First condition
+        official_position == Constant.QB,  # First condition
         1,  # If true, set to 1 (indicating the player is a QB)
         np.where(
             team == possession_team,  # Second condition inside the else of the first
diff --git a/unravel/american_football/graphs/graph_converter.py b/unravel/american_football/graphs/graph_converter.py
index 172164d..07b01e1 100644
--- a/unravel/american_football/graphs/graph_converter.py
+++ b/unravel/american_football/graphs/graph_converter.py
@@ -7,7 +7,7 @@
 
 from spektral.data import Graph
 
-from .dataset import BigDataBowlDataset
+from .dataset import BigDataBowlDataset, Group, Column, Constant
 
 from .graph_settings import (
     AmericanFootballGraphSettings,
@@ -19,7 +19,7 @@
     compute_adjacency_matrix,
 )
 
-from ...utils import DefaultGraphConverter, flatten_to_reshaped_array, make_sparse
+from ...utils import DefaultGraphConverter, reshape_array, make_sparse
 
 
 @dataclass(repr=True)
@@ -39,8 +39,6 @@ class AmericanFootballGraphConverter(DefaultGraphConverter):
     def __init__(
         self,
         dataset: BigDataBowlDataset,
-        label_col: str = "label",
-        graph_id_col: str = "graph_id",
         chunk_size: int = 2_000,
         attacking_non_qb_node_value: float = 0.1,
         **kwargs,
@@ -50,12 +48,13 @@ def __init__(
         if not isinstance(dataset, BigDataBowlDataset):
             raise Exception("'dataset' should be an instance of BigDataBowlDataset")
 
+        self.label_col = dataset._label_column
+        self.graph_id_col = dataset._graph_id_column
+
         self.dataset: pl.DataFrame = dataset.data
         self.pitch_dimensions: AmericanFootballPitchDimensions = (
             dataset.pitch_dimensions
         )
-        self.label_col = label_col
-        self.graph_id_col = graph_id_col
         self.chunk_size = chunk_size
         self.attacking_non_qb_node_value = attacking_non_qb_node_value
 
@@ -108,163 +107,143 @@ def _apply_settings(self):
             verbose=self.verbose,
         )
 
-    def _convert(self):
-        def __compute(args: List[pl.Series]) -> dict:
-            x = args[0].to_numpy()
-            y = args[1].to_numpy()
-            s = args[2].to_numpy()
-            a = args[3].to_numpy()
-            dis = args[4].to_numpy()
-            o = args[5].to_numpy()
-            dir = args[6].to_numpy()
-            team = args[7].to_numpy()
-            official_position = args[8].to_numpy()
-            possession_team = args[9].to_numpy()
-            height = args[10].to_numpy()
-            weight = args[11].to_numpy()
-            graph_id = args[12].to_numpy()
-            label = args[13].to_numpy()
-
-            if not np.all(graph_id == graph_id[0]):
-                raise Exception(
-                    "GraphId selection contains multiple different values. Make sure each GraphId is unique by at least playId and frameId..."
-                )
-
-            if not np.all(label == label[0]):
-                raise Exception(
-                    "Label selection contains multiple different values for a single selection (group by) of playId and frameId, make sure this is not the case. Each group can only have 1 label."
-                )
-            adjacency_matrix = compute_adjacency_matrix(
-                team=team, possession_team=possession_team, settings=self.settings
-            )
-            edge_features = compute_edge_features(
-                adjacency_matrix=adjacency_matrix,
-                p=np.stack((x, y), axis=-1),
-                s=s,
-                a=a,
-                dir=dir,
-                o=o,  # Shape will be (N, 2)
-                team=team,
-                settings=self.settings,
+    @property
+    def __exprs_variables(self):
+        return [
+            Column.X,
+            Column.Y,
+            Column.SPEED,
+            Column.ACCELERATION,
+            Column.ORIENTATION,
+            Column.DIRECTION,
+            Column.TEAM,
+            Column.OFFICIAL_POSITION,
+            Column.POSSESSION_TEAM,
+            Column.HEIGHT_CM,
+            Column.WEIGHT_KG,
+            self.graph_id_col,
+            self.label_col,
+        ]
+
+    def __compute(self, args: List[pl.Series]) -> dict:
+        d = {col: args[i].to_numpy() for i, col in enumerate(self.__exprs_variables)}
+
+        if not np.all(d[self.graph_id_col] == d[self.graph_id_col][0]):
+            raise Exception(
+                "GraphId selection contains multiple different values. Make sure each graph_id is unique by at least game_id and frame_id..."
             )
-            node_features = compute_node_features(
-                x,
-                y,
-                s=s,
-                a=a,
-                dir=dir,
-                o=o,
-                team=team,
-                official_position=official_position,
-                possession_team=possession_team,
-                height=height,
-                weight=weight,
-                settings=self.settings,
+
+        if not self.prediction and not np.all(
+            d[self.label_col] == d[self.label_col][0]
+        ):
+            raise Exception(
+                """Label selection contains multiple different values for a single selection (group by) of game_id and frame_id, 
+                make sure this is not the case. Each group can only have 1 label."""
             )
-            return {
-                "e": pl.Series(
-                    [edge_features.tolist()], dtype=pl.List(pl.List(pl.Float64))
-                ),
-                "x": pl.Series(
-                    [node_features.tolist()], dtype=pl.List(pl.List(pl.Float64))
-                ),
-                "a": pl.Series(
-                    [adjacency_matrix.tolist()], dtype=pl.List(pl.List(pl.Int32))
-                ),
-                "e_shape_0": edge_features.shape[0],
-                "e_shape_1": edge_features.shape[1],
-                "x_shape_0": node_features.shape[0],
-                "x_shape_1": node_features.shape[1],
-                "a_shape_0": adjacency_matrix.shape[0],
-                "a_shape_1": adjacency_matrix.shape[1],
-                self.graph_id_col: graph_id[0],
-                self.label_col: label[0],
-            }
-
-        result_df = self.dataset.group_by(
-            ["gameId", "playId", "frameId"], maintain_order=True
-        ).agg(
-            pl.map_groups(
-                exprs=[
-                    "x",
-                    "y",
-                    "s",
-                    "a",
-                    "dis",
-                    "o",
-                    "dir",
-                    "team",
-                    "officialPosition",
-                    "possessionTeam",
-                    "height_cm",
-                    "weight_kg",
-                    self.graph_id_col,
-                    self.label_col,
-                ],
-                function=__compute,
-            ).alias("result_dict")
-        )
 
-        graph_df = result_df.with_columns(
-            [
-                pl.col("result_dict").struct.field("a").alias("a"),
-                pl.col("result_dict").struct.field("e").alias("e"),
-                pl.col("result_dict").struct.field("x").alias("x"),
-                pl.col("result_dict").struct.field("e_shape_0").alias("e_shape_0"),
-                pl.col("result_dict").struct.field("e_shape_1").alias("e_shape_1"),
-                pl.col("result_dict").struct.field("x_shape_0").alias("x_shape_0"),
-                pl.col("result_dict").struct.field("x_shape_1").alias("x_shape_1"),
-                pl.col("result_dict").struct.field("a_shape_0").alias("a_shape_0"),
-                pl.col("result_dict").struct.field("a_shape_1").alias("a_shape_1"),
-                pl.col("result_dict")
-                .struct.field(self.graph_id_col)
-                .alias(self.graph_id_col),
-                pl.col("result_dict")
-                .struct.field(self.label_col)
-                .alias(self.label_col),
-            ]
+        adjacency_matrix = compute_adjacency_matrix(
+            team=d[Column.TEAM],
+            possession_team=d[Column.POSSESSION_TEAM],
+            settings=self.settings,
         )
+        edge_features = compute_edge_features(
+            adjacency_matrix=adjacency_matrix,
+            p=np.stack((d[Column.X], d[Column.Y]), axis=-1),
+            s=d[Column.SPEED],
+            a=d[Column.ACCELERATION],
+            dir=d[Column.DIRECTION],
+            o=d[Column.ORIENTATION],
+            team=d[Column.TEAM],
+            settings=self.settings,
+        )
+        node_features = compute_node_features(
+            x=d[Column.X],
+            y=d[Column.Y],
+            s=d[Column.SPEED],
+            a=d[Column.ACCELERATION],
+            dir=d[Column.DIRECTION],
+            o=d[Column.ORIENTATION],
+            team=d[Column.TEAM],
+            official_position=d[Column.OFFICIAL_POSITION],
+            possession_team=d[Column.POSSESSION_TEAM],
+            height=d[Column.HEIGHT_CM],
+            weight=d[Column.WEIGHT_KG],
+            settings=self.settings,
+        )
+        return {
+            "e": pl.Series(
+                [edge_features.tolist()], dtype=pl.List(pl.List(pl.Float64))
+            ),
+            "x": pl.Series(
+                [node_features.tolist()], dtype=pl.List(pl.List(pl.Float64))
+            ),
+            "a": pl.Series(
+                [adjacency_matrix.tolist()], dtype=pl.List(pl.List(pl.Int32))
+            ),
+            "e_shape_0": edge_features.shape[0],
+            "e_shape_1": edge_features.shape[1],
+            "x_shape_0": node_features.shape[0],
+            "x_shape_1": node_features.shape[1],
+            "a_shape_0": adjacency_matrix.shape[0],
+            "a_shape_1": adjacency_matrix.shape[1],
+            self.graph_id_col: d[self.graph_id_col][0],
+            self.label_col: d[self.label_col][0],
+        }
 
-        return graph_df.drop("result_dict")
-
-    def to_graph_frames(self) -> List[dict]:
-        def __convert_to_graph_data_list(df):
-            lazy_df = df.lazy()
-
-            graph_list = []
-
-            for chunk in lazy_df.collect().iter_slices(self.chunk_size):
-                chunk_graph_list = [
-                    {
-                        "a": make_sparse(
-                            flatten_to_reshaped_array(
-                                arr=chunk["a"][i],
-                                s0=chunk["a_shape_0"][i],
-                                s1=chunk["a_shape_1"][i],
-                            )
-                        ),
-                        "x": flatten_to_reshaped_array(
-                            arr=chunk["x"][i],
-                            s0=chunk["x_shape_0"][i],
-                            s1=chunk["x_shape_1"][i],
-                        ),
-                        "e": flatten_to_reshaped_array(
-                            arr=chunk["e"][i],
-                            s0=chunk["e_shape_0"][i],
-                            s1=chunk["e_shape_1"][i],
-                        ),
-                        "y": np.asarray([chunk[self.label_col][i]]),
-                        "id": chunk[self.graph_id_col][i],
-                    }
-                    for i in range(len(chunk["a"]))
+    def _convert(self):
+        # One row per Group.BY_FRAME group: run __compute via map_groups, unpack the "result_dict" struct into columns, then drop it
+        return (
+            self.dataset.group_by(Group.BY_FRAME, maintain_order=True)
+            .agg(
+                pl.map_groups(
+                    exprs=self.__exprs_variables, function=self.__compute
+                ).alias("result_dict")
+            )
+            .with_columns(
+                [
+                    *[
+                        pl.col("result_dict").struct.field(f).alias(f)
+                        for f in ["a", "e", "x", self.graph_id_col, self.label_col]
+                    ],
+                    *[
+                        pl.col("result_dict")
+                        .struct.field(f"{m}_shape_{i}")
+                        .alias(f"{m}_shape_{i}")
+                        for m in ["a", "e", "x"]
+                        for i in [0, 1]
+                    ],
                 ]
-                graph_list.extend(chunk_graph_list)
+            )
+            .drop("result_dict")
+        )
 
-            return graph_list
+    def to_graph_frames(self) -> List[dict]:
+        def process_chunk(chunk: pl.DataFrame) -> List[dict]:
+            return [
+                {
+                    "a": make_sparse(
+                        reshape_array(
+                            chunk["a"][i], chunk["a_shape_0"][i], chunk["a_shape_1"][i]
+                        )
+                    ),
+                    "x": reshape_array(
+                        chunk["x"][i], chunk["x_shape_0"][i], chunk["x_shape_1"][i]
+                    ),
+                    "e": reshape_array(
+                        chunk["e"][i], chunk["e_shape_0"][i], chunk["e_shape_1"][i]
+                    ),
+                    "y": np.asarray([chunk[self.label_col][i]]),
+                    "id": chunk[self.graph_id_col][i],
+                }
+                for i in range(len(chunk))
+            ]
 
         graph_df = self._convert()
-        self.graph_frames = __convert_to_graph_data_list(graph_df)
-
+        self.graph_frames = [
+            graph
+            for chunk in graph_df.lazy().collect().iter_slices(self.chunk_size)
+            for graph in process_chunk(chunk)
+        ]
         return self.graph_frames
 
     def to_spektral_graphs(self) -> List[Graph]:
diff --git a/unravel/american_football/graphs/graph_settings.py b/unravel/american_football/graphs/graph_settings.py
index 5de30aa..9c96dfe 100644
--- a/unravel/american_football/graphs/graph_settings.py
+++ b/unravel/american_football/graphs/graph_settings.py
@@ -26,8 +26,6 @@ def __post_init__(self):
 @dataclass
 class AmericanFootballGraphSettings(DefaultGraphSettings):
     pitch_dimensions: AmericanFootballPitchDimensions = None
-    ball_id: str = "football"
-    qb_id: str = "QB"
     attacking_non_qb_node_value: float = 0.1
     max_height: float = 225.0  # in cm
     min_height: float = 150.0
diff --git a/unravel/soccer/graphs/dataset.py b/unravel/soccer/graphs/dataset.py
index e9d66b5..b15c452 100644
--- a/unravel/soccer/graphs/dataset.py
+++ b/unravel/soccer/graphs/dataset.py
@@ -41,12 +41,12 @@ class Column:
     Y = "y"
     Z = "z"
 
-    V = "v"
+    SPEED = "v"
     VX = "vx"
     VY = "vy"
     VZ = "vz"
 
-    A = "a"
+    ACCELERATION = "a"
     AX = "ax"
     AY = "ay"
     AZ = "az"
@@ -67,14 +67,18 @@ class SoccerObject:
 
 @dataclass
 class KloppyPolarsDataset(DefaultDataset):
-    kloppy_dataset: TrackingDataset
-    ball_carrier_threshold: float = 25.0
-    _graph_id_column: str = field(default="graph_id")
-    _label_column: str = field(default="label")
-    _overwrite_orientation: bool = field(default=False, init=False)
-    _infer_goalkeepers: bool = field(default=False, init=False)
-
-    def __post_init__(self):
+    def __init__(
+        self,
+        kloppy_dataset: TrackingDataset,
+        ball_carrier_threshold: float = 25.0,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.kloppy_dataset = kloppy_dataset
+        self.ball_carrier_threshold = ball_carrier_threshold
+        self._overwrite_orientation: bool = False
+        self._infer_goalkeepers: bool = False
+
         if not isinstance(self.kloppy_dataset, TrackingDataset):
             raise Exception("'kloppy_dataset' should be of type float")
 
@@ -284,7 +288,7 @@ def __add_velocity(
                     + pl.col(Column.VZ) ** 2
                 )
                 .sqrt()
-                .alias(Column.V)
+                .alias(Column.SPEED)
             ]
         )
 
@@ -325,7 +329,7 @@ def __add_acceleration(self, df: pl.DataFrame):
                         + pl.col(Column.AZ) ** 2
                     )
                     .sqrt()
-                    .alias(Column.A)
+                    .alias(Column.ACCELERATION)
                 ]
             )
         )
diff --git a/unravel/soccer/graphs/features/edge_features_pl.py b/unravel/soccer/graphs/features/edge_features_pl.py
index 3852e6d..ce4defe 100644
--- a/unravel/soccer/graphs/features/edge_features_pl.py
+++ b/unravel/soccer/graphs/features/edge_features_pl.py
@@ -20,6 +20,8 @@
     normalize_accelerations_nfl,
 )
 
+from ..dataset import Constant
+
 
 def compute_edge_features_pl(adjacency_matrix, p3d, p2d, s, velocity, team, settings):
     # Compute pairwise distances using broadcasting
@@ -39,6 +41,7 @@ def compute_edge_features_pl(adjacency_matrix, p3d, p2d, s, velocity, team, sett
     speed_diff_matrix_normed = normalize_speed_differences_nfl(
         s=speed_diff_matrix,
         team=team,
+        ball_id=Constant.BALL,
         settings=settings,
     )
 
diff --git a/unravel/soccer/graphs/features/node_features_pl.py b/unravel/soccer/graphs/features/node_features_pl.py
index 804ebd4..c95d8b2 100644
--- a/unravel/soccer/graphs/features/node_features_pl.py
+++ b/unravel/soccer/graphs/features/node_features_pl.py
@@ -18,6 +18,7 @@
     normalize_speed,
     distance_to_ball,
 )
+from ..dataset import Constant
 
 
 def compute_node_features_pl(
@@ -31,7 +32,7 @@ def compute_node_features_pl(
     ball_carrier,
     settings,
 ):
-    ball_id = settings.ball_id
+    ball_id = Constant.BALL
 
     goal_mouth_position = (
         settings.pitch_dimensions.x_dim.max,
@@ -60,7 +61,7 @@ def compute_node_features_pl(
         max_value=settings.pitch_dimensions.y_dim.max,
         min_value=settings.pitch_dimensions.y_dim.min,
     )
-    s_normed = normalize_speeds_nfl(s, team, settings)
+    s_normed = normalize_speeds_nfl(s, team, ball_id=Constant.BALL, settings=settings)
     uv_velocity = unit_vectors(velocity)
 
     angles = normalize_angles(np.arctan2(uv_velocity[:, 1], uv_velocity[:, 0]))
diff --git a/unravel/soccer/graphs/graph_converter_pl.py b/unravel/soccer/graphs/graph_converter_pl.py
index 077e296..50d07b6 100644
--- a/unravel/soccer/graphs/graph_converter_pl.py
+++ b/unravel/soccer/graphs/graph_converter_pl.py
@@ -1,34 +1,16 @@
 import logging
 import sys
-from copy import deepcopy
 
-import pandas as pd
-
-import warnings
-
-from dataclasses import dataclass, field, asdict
+from dataclasses import dataclass
 
 from typing import List, Union, Dict, Literal, Any
 
 from kloppy.domain import (
-    TrackingDataset,
-    Frame,
-    Orientation,
-    DatasetTransformer,
-    DatasetFlag,
-    SecondSpectrumCoordinateSystem,
     MetricPitchDimensions,
 )
 
 from spektral.data import Graph
 
-from .exceptions import (
-    MissingLabelsError,
-    MissingDatasetError,
-    IncorrectDatasetTypeError,
-    KeyMismatchError,
-)
-
 from .graph_settings_pl import GraphSettingsPolars
 from .dataset import KloppyPolarsDataset, Column, Group, Constant
 from .features import (
@@ -106,11 +88,11 @@ def _apply_padding(self) -> pl.DataFrame:
             Column.VX,
             Column.VY,
             Column.VZ,
-            Column.V,
+            Column.SPEED,
             Column.AX,
             Column.AY,
             Column.AZ,
-            Column.A,
+            Column.ACCELERATION,
         ]
         group_by_columns = [
             Column.GAME_ID,
@@ -214,29 +196,29 @@ def _apply_filters(self):
         return self.dataset.with_columns(
             pl.when(
                 (pl.col(Column.OBJECT_ID) == Constant.BALL)
-                & (pl.col(Column.V) > self.settings.max_ball_speed)
+                & (pl.col(Column.SPEED) > self.settings.max_ball_speed)
             )
             .then(self.settings.max_ball_speed)
             .when(
                 (pl.col(Column.OBJECT_ID) != Constant.BALL)
-                & (pl.col(Column.V) > self.settings.max_player_speed)
+                & (pl.col(Column.SPEED) > self.settings.max_player_speed)
             )
             .then(self.settings.max_player_speed)
-            .otherwise(pl.col(Column.V))
-            .alias(Column.V)
+            .otherwise(pl.col(Column.SPEED))
+            .alias(Column.SPEED)
         ).with_columns(
             pl.when(
                 (pl.col(Column.OBJECT_ID) == Constant.BALL)
-                & (pl.col(Column.A) > self.settings.max_ball_acceleration)
+                & (pl.col(Column.ACCELERATION) > self.settings.max_ball_acceleration)
             )
             .then(self.settings.max_ball_acceleration)
             .when(
                 (pl.col(Column.OBJECT_ID) != Constant.BALL)
-                & (pl.col(Column.A) > self.settings.max_player_acceleration)
+                & (pl.col(Column.ACCELERATION) > self.settings.max_player_acceleration)
             )
             .then(self.settings.max_player_acceleration)
-            .otherwise(pl.col(Column.A))
-            .alias(Column.A)
+            .otherwise(pl.col(Column.ACCELERATION))
+            .alias(Column.ACCELERATION)
         )
 
     def _apply_settings(self):
@@ -290,11 +272,11 @@ def __exprs_variables(self):
             Column.X,
             Column.Y,
             Column.Z,
-            Column.V,
+            Column.SPEED,
             Column.VX,
             Column.VY,
             Column.VZ,
-            Column.A,
+            Column.ACCELERATION,
             Column.AX,
             Column.AY,
             Column.AZ,
@@ -321,6 +303,7 @@ def __compute(self, args: List[pl.Series]) -> dict:
                 """Label selection contains multiple different values for a single selection (group by) of game_id and frame_id, 
                 make sure this is not the case. Each group can only have 1 label."""
             )
+
         ball_carriers = np.where(d[Column.IS_BALL_CARRIER] == True)[0]
         if len(ball_carriers) == 0:
             ball_carrier_idx = None
@@ -339,7 +322,7 @@ def __compute(self, args: List[pl.Series]) -> dict:
             adjacency_matrix=adjacency_matrix,
             p3d=np.stack((d[Column.X], d[Column.Y], d[Column.Z]), axis=-1),
             p2d=np.stack((d[Column.X], d[Column.Y]), axis=-1),
-            s=d[Column.V],
+            s=d[Column.SPEED],
             velocity=velocity,
             team=d[Column.TEAM_ID],
             settings=self.settings,
@@ -348,7 +331,7 @@ def __compute(self, args: List[pl.Series]) -> dict:
         node_features = compute_node_features_pl(
             d[Column.X],
             d[Column.Y],
-            s=d[Column.V],
+            s=d[Column.SPEED],
             velocity=velocity,
             team=d[Column.TEAM_ID],
             possession_team=d[Column.BALL_OWNING_TEAM_ID],
@@ -377,77 +360,6 @@ def __compute(self, args: List[pl.Series]) -> dict:
             self.label_col: d[self.label_col][0],
         }
 
-    # def _convert(self):
-    #     result_df = self.dataset.group_by(Group.BY_FRAME, maintain_order=True).agg(
-    #         pl.map_groups(
-    #             exprs=self.__exprs_variables,
-    #             function=self.__compute,
-    #         ).alias("result_dict")
-    #     )
-
-    #     graph_df = result_df.with_columns(
-    #         [
-    #             pl.col("result_dict").struct.field("a").alias("a"),
-    #             pl.col("result_dict").struct.field("e").alias("e"),
-    #             pl.col("result_dict").struct.field("x").alias("x"),
-    #             pl.col("result_dict").struct.field("e_shape_0").alias("e_shape_0"),
-    #             pl.col("result_dict").struct.field("e_shape_1").alias("e_shape_1"),
-    #             pl.col("result_dict").struct.field("x_shape_0").alias("x_shape_0"),
-    #             pl.col("result_dict").struct.field("x_shape_1").alias("x_shape_1"),
-    #             pl.col("result_dict").struct.field("a_shape_0").alias("a_shape_0"),
-    #             pl.col("result_dict").struct.field("a_shape_1").alias("a_shape_1"),
-    #             pl.col("result_dict")
-    #             .struct.field(self.graph_id_col)
-    #             .alias(self.graph_id_col),
-    #             pl.col("result_dict")
-    #             .struct.field(self.label_col)
-    #             .alias(self.label_col),
-    #         ]
-    #     )
-
-    #     return graph_df.drop("result_dict")
-
-    # def to_graph_frames(self) -> List[dict]:
-    #     def __convert_to_graph_data_list(df):
-    #         lazy_df = df.lazy()
-
-    #         graph_list = []
-
-    #         for chunk in lazy_df.collect().iter_slices(self.chunk_size):
-    #             chunk_graph_list = [
-    #                 {
-    #                     "a": make_sparse(
-    #                         flatten_to_reshaped_array(
-    #                             arr=chunk["a"][i],
-    #                             s0=chunk["a_shape_0"][i],
-    #                             s1=chunk["a_shape_1"][i],
-    #                         )
-    #                     ),
-    #                     "x": flatten_to_reshaped_array(
-    #                         arr=chunk["x"][i],
-    #                         s0=chunk["x_shape_0"][i],
-    #                         s1=chunk["x_shape_1"][i],
-    #                     ),
-    #                     "e": flatten_to_reshaped_array(
-    #                         arr=chunk["e"][i],
-    #                         s0=chunk["e_shape_0"][i],
-    #                         s1=chunk["e_shape_1"][i],
-    #                     ),
-    #                     "y": np.asarray([chunk[self.label_col][i]]),
-    #                     "id": chunk[self.graph_id_col][i],
-    #                 }
-    #                 for i in range(len(chunk["a"]))
-    #             ]
-    #             graph_list.extend(chunk_graph_list)
-
-    #         return graph_list
-
-    #     graph_df = self._convert()
-    #     self.graph_frames = __convert_to_graph_data_list(graph_df)
-
-    #     return self.graph_frames
-
-    ###
     def _convert(self):
         # Group and aggregate in one step
         return (
@@ -475,23 +387,19 @@ def _convert(self):
             .drop("result_dict")
         )
 
-    @staticmethod
-    def _reshape_array(arr, s0, s1):
-        return np.array([item for sublist in arr for item in sublist]).reshape(s0, s1)
-
     def to_graph_frames(self) -> List[dict]:
         def process_chunk(chunk: pl.DataFrame) -> List[dict]:
             return [
                 {
                     "a": make_sparse(
-                        self._reshape_array(
+                        reshape_array(
                             chunk["a"][i], chunk["a_shape_0"][i], chunk["a_shape_1"][i]
                         )
                     ),
-                    "x": self._reshape_array(
+                    "x": reshape_array(
                         chunk["x"][i], chunk["x_shape_0"][i], chunk["x_shape_1"][i]
                     ),
-                    "e": self._reshape_array(
+                    "e": reshape_array(
                         chunk["e"][i], chunk["e_shape_0"][i], chunk["e_shape_1"][i]
                     ),
                     "y": np.asarray([chunk[self.label_col][i]]),
@@ -508,8 +416,6 @@ def process_chunk(chunk: pl.DataFrame) -> List[dict]:
         ]
         return self.graph_frames
 
-    ###
-
     def to_spektral_graphs(self) -> List[Graph]:
         if not self.graph_frames:
             self.to_graph_frames()
diff --git a/unravel/utils/features/utils.py b/unravel/utils/features/utils.py
index c11e8f3..ef89bc2 100644
--- a/unravel/utils/features/utils.py
+++ b/unravel/utils/features/utils.py
@@ -146,8 +146,8 @@ def normalize_acceleration(value, max_acceleration):
     return np.clip(x, -1, 1)
 
 
-def normalize_speeds_nfl(s, team, settings):
-    ball_mask = team == settings.ball_id
+def normalize_speeds_nfl(s, team, ball_id, settings):
+    ball_mask = team == ball_id
     s_normed = np.zeros_like(s)
 
     s_normed[ball_mask] = normalize_speed(s[ball_mask], settings.max_ball_speed)
@@ -156,13 +156,13 @@ def normalize_speeds_nfl(s, team, settings):
     return s_normed
 
 
-def normalize_speed_differences_nfl(s, team, settings):
+def normalize_speed_differences_nfl(s, team, ball_id, settings):
 
-    return normalize_speeds_nfl(s, team, settings) * np.sign(s)
+    return normalize_speeds_nfl(s, team, ball_id, settings) * np.sign(s)
 
 
-def normalize_accelerations_nfl(a, team, settings):
-    ball_mask = team == settings.ball_id
+def normalize_accelerations_nfl(a, team, ball_id, settings):
+    ball_mask = team == ball_id
     a_normed = np.zeros_like(a)
 
     a_normed[ball_mask] = normalize_acceleration(
@@ -183,6 +183,10 @@ def flatten_to_reshaped_array(arr, s0, s1, as_list=False):
     return result_array if not as_list else result_array.tolist()
 
 
+def reshape_array(arr, s0, s1):
+    return np.array([item for sublist in arr for item in sublist]).reshape(s0, s1)
+
+
 def distance_to_ball(
     x: np.array, y: np.array, team: np.array, ball_id: str, z: np.array = None
 ):
diff --git a/unravel/utils/objects/default_dataset.py b/unravel/utils/objects/default_dataset.py
index b31280e..17ad9b6 100644
--- a/unravel/utils/objects/default_dataset.py
+++ b/unravel/utils/objects/default_dataset.py
@@ -1,8 +1,11 @@
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 
 
 @dataclass
 class DefaultDataset:
+    _graph_id_column: str = field(default="graph_id")
+    _label_column: str = field(default="label")
+
     def load(self):
         raise NotImplementedError()
 
diff --git a/unravel/utils/objects/default_graph_converter.py b/unravel/utils/objects/default_graph_converter.py
index dd3f3f5..79bc16e 100644
--- a/unravel/utils/objects/default_graph_converter.py
+++ b/unravel/utils/objects/default_graph_converter.py
@@ -152,6 +152,9 @@ def __post_init__(self):
         if not isinstance(self.verbose, bool):
             raise Exception("'verbose' should be of type boolean (bool)")
 
+    def _shuffle(self):
+        raise NotImplementedError()
+
     def _sport_specific_checks(self):
         raise NotImplementedError(
             "No sport specific checks implementend... Make sure to check for existens of labels of some sort, and graph ids of some sort..."

From 08e79c398220359521e21106a8cd9a03cb05c822 Mon Sep 17 00:00:00 2001
From: "UnravelSports [JB]" <jors@unravelsports.com>
Date: Sun, 26 Jan 2025 13:38:51 +0100
Subject: [PATCH 06/10] deprecation warning

---
 unravel/soccer/graphs/graph_converter.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/unravel/soccer/graphs/graph_converter.py b/unravel/soccer/graphs/graph_converter.py
index 31c093b..d57cb5c 100644
--- a/unravel/soccer/graphs/graph_converter.py
+++ b/unravel/soccer/graphs/graph_converter.py
@@ -4,7 +4,7 @@
 
 from scipy.spatial.qhull import QhullError
 
-import warnings
+from warnings import warn, simplefilter
 
 from dataclasses import dataclass, field, asdict
 
@@ -33,6 +33,9 @@
 
 from ...utils import *
 
+simplefilter("always", DeprecationWarning)
+
+
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.DEBUG)
 stdout_handler = logging.StreamHandler(sys.stdout)
@@ -80,6 +83,14 @@ class SoccerGraphConverter(DefaultGraphConverter):
     non_potential_receiver_node_value: float = 0.1
 
     def __post_init__(self):
+        warn(
+            """
+            This class is deprecated and will be removed in a future release. Please use SoccerGraphConverterPolars for better performance.
+            Note: SoccerGraphConverterPolars is not one-to-one compatible with models and dataset created from SoccerGraphConverter due to breaking changes.
+            """,
+            category=DeprecationWarning,
+            stacklevel=2,
+        )
         if not self.dataset:
             raise Exception("Please provide a 'kloppy' dataset.")
 
@@ -204,7 +215,7 @@ def _convert(self, frame: Frame):
 
             if not self.prediction and label is None:
                 if self.settings.verbose:
-                    warnings.warn(
+                    warn(
                         f"""No label for frame={frame.frame_id} in 'labels'...""",
                         NoLabelWarning,
                     )

From ce14e38a78f89c846f117c4ba873572e4bb0b3b4 Mon Sep 17 00:00:00 2001
From: "UnravelSports [JB]" <jors@unravelsports.com>
Date: Sun, 26 Jan 2025 13:50:51 +0100
Subject: [PATCH 07/10] kloppy 3.16

---
 requirements.txt | 2 +-
 setup.py         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 72295da..089acfb 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
 numpy==1.26.4
 spektral==1.2.0
-kloppy==3.15.0
+kloppy==3.16.0
 tensorflow>=2.14.0; platform_machine != 'arm64' or platform_system != 'Darwin'
 tensorflow-macos>=2.14.0; platform_machine == 'arm64' and platform_system == 'Darwin'
 keras==2.14.0
diff --git a/setup.py b/setup.py
index 9f4fa76..9ec6b79 100644
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,7 @@ def read_version():
     python_requires="~=3.11",
     install_requires=[
         "spektral==1.2.0",
-        "kloppy==3.15.0",
+        "kloppy==3.16.0",
         "tensorflow>=2.14.0;platform_machine != 'arm64' or platform_system != 'Darwin'",
         "tensorflow-macos>=2.14.0;platform_machine == 'arm64' and platform_system == 'Darwin'",
         "keras==2.14.0",

From 676014ebfcd6831e001dc1e7f6c848bb8a54f6b6 Mon Sep 17 00:00:00 2001
From: "UnravelSports [JB]" <jors@unravelsports.com>
Date: Sun, 26 Jan 2025 17:26:23 +0100
Subject: [PATCH 08/10] minor

---
 unravel/__init__.py                           |  2 +-
 unravel/american_football/graphs/dataset.py   |  2 +-
 .../graphs/graph_converter.py                 | 42 +++++++++++------
 unravel/soccer/graphs/graph_converter.py      |  1 -
 unravel/soccer/graphs/graph_converter_pl.py   | 46 ++++++++++++-------
 .../utils/objects/default_graph_converter.py  |  3 ++
 6 files changed, 61 insertions(+), 35 deletions(-)

diff --git a/unravel/__init__.py b/unravel/__init__.py
index b0cda09..b235f04 100644
--- a/unravel/__init__.py
+++ b/unravel/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.2.0"
+__version__ = "0.3.0"
 
 from .soccer import *
 from .american_football import *
diff --git a/unravel/american_football/graphs/dataset.py b/unravel/american_football/graphs/dataset.py
index 4b8ccff..5273b4a 100644
--- a/unravel/american_football/graphs/dataset.py
+++ b/unravel/american_football/graphs/dataset.py
@@ -125,7 +125,7 @@ def load(self):
             separator=",",
             encoding="utf8",
             null_values=["NA", "NULL", ""],
-            dtypes={"birthDate": pl.Date},
+            schema_overrides={"birthDate": pl.Date},
             ignore_errors=True,
         )
         if "position" in players.columns:
diff --git a/unravel/american_football/graphs/graph_converter.py b/unravel/american_football/graphs/graph_converter.py
index 07b01e1..5f899b7 100644
--- a/unravel/american_football/graphs/graph_converter.py
+++ b/unravel/american_football/graphs/graph_converter.py
@@ -48,8 +48,14 @@ def __init__(
         if not isinstance(dataset, BigDataBowlDataset):
             raise Exception("'dataset' should be an instance of BigDataBowlDataset")
 
-        self.label_col = dataset._label_column
-        self.graph_id_col = dataset._graph_id_column
+        self.label_column: str = (
+            self.label_col if self.label_col is not None else dataset._label_column
+        )
+        self.graph_id_column: str = (
+            self.graph_id_col
+            if self.graph_id_col is not None
+            else dataset._graph_id_column
+        )
 
         self.dataset: pl.DataFrame = dataset.data
         self.pitch_dimensions: AmericanFootballPitchDimensions = (
@@ -64,21 +70,21 @@ def __init__(
 
     def _sport_specific_checks(self):
 
-        if not isinstance(self.label_col, str):
+        if not isinstance(self.label_column, str):
             raise Exception("'label_col' should be of type string (str)")
 
-        if not isinstance(self.graph_id_col, str):
+        if not isinstance(self.graph_id_column, str):
             raise Exception("'graph_id_col' should be of type string (str)")
 
         if not isinstance(self.chunk_size, int):
             raise Exception("chunk_size should be of type integer (int)")
 
-        if not self.label_col in self.dataset.columns and not self.prediction:
+        if not self.label_column in self.dataset.columns and not self.prediction:
             raise Exception(
                 "Please specify a 'label_col' and add that column to your 'dataset' or set 'prediction=True' if you want to use the converted dataset to make predictions on."
             )
 
-        if not self.graph_id_col in self.dataset.columns:
+        if not self.graph_id_column in self.dataset.columns:
             raise Exception(
                 "Please specify a 'graph_id_col' and add that column to your 'dataset' ..."
             )
@@ -121,20 +127,20 @@ def __exprs_variables(self):
             Column.POSSESSION_TEAM,
             Column.HEIGHT_CM,
             Column.WEIGHT_KG,
-            self.graph_id_col,
-            self.label_col,
+            self.graph_id_column,
+            self.label_column,
         ]
 
     def __compute(self, args: List[pl.Series]) -> dict:
         d = {col: args[i].to_numpy() for i, col in enumerate(self.__exprs_variables)}
 
-        if not np.all(d[self.graph_id_col] == d[self.graph_id_col][0]):
+        if not np.all(d[self.graph_id_column] == d[self.graph_id_column][0]):
             raise Exception(
                 "GraphId selection contains multiple different values. Make sure each graph_id is unique by at least game_id and frame_id..."
             )
 
         if not self.prediction and not np.all(
-            d[self.label_col] == d[self.label_col][0]
+            d[self.label_column] == d[self.label_column][0]
         ):
             raise Exception(
                 """Label selection contains multiple different values for a single selection (group by) of game_id and frame_id, 
@@ -186,8 +192,8 @@ def __compute(self, args: List[pl.Series]) -> dict:
             "x_shape_1": node_features.shape[1],
             "a_shape_0": adjacency_matrix.shape[0],
             "a_shape_1": adjacency_matrix.shape[1],
-            self.graph_id_col: d[self.graph_id_col][0],
-            self.label_col: d[self.label_col][0],
+            self.graph_id_column: d[self.graph_id_column][0],
+            self.label_column: d[self.label_column][0],
         }
 
     def _convert(self):
@@ -203,7 +209,13 @@ def _convert(self):
                 [
                     *[
                         pl.col("result_dict").struct.field(f).alias(f)
-                        for f in ["a", "e", "x", self.graph_id_col, self.label_col]
+                        for f in [
+                            "a",
+                            "e",
+                            "x",
+                            self.graph_id_column,
+                            self.label_column,
+                        ]
                     ],
                     *[
                         pl.col("result_dict")
@@ -232,8 +244,8 @@ def process_chunk(chunk: pl.DataFrame) -> List[dict]:
                     "e": reshape_array(
                         chunk["e"][i], chunk["e_shape_0"][i], chunk["e_shape_1"][i]
                     ),
-                    "y": np.asarray([chunk[self.label_col][i]]),
-                    "id": chunk[self.graph_id_col][i],
+                    "y": np.asarray([chunk[self.label_column][i]]),
+                    "id": chunk[self.graph_id_column][i],
                 }
                 for i in range(len(chunk))
             ]
diff --git a/unravel/soccer/graphs/graph_converter.py b/unravel/soccer/graphs/graph_converter.py
index d57cb5c..1262598 100644
--- a/unravel/soccer/graphs/graph_converter.py
+++ b/unravel/soccer/graphs/graph_converter.py
@@ -71,7 +71,6 @@ class SoccerGraphConverter(DefaultGraphConverter):
     dataset: TrackingDataset = None
     labels: dict = None
 
-    labels: dict = None
     graph_id: Union[str, int, dict] = None
     graph_ids: dict = None
 
diff --git a/unravel/soccer/graphs/graph_converter_pl.py b/unravel/soccer/graphs/graph_converter_pl.py
index 50d07b6..5beeb82 100644
--- a/unravel/soccer/graphs/graph_converter_pl.py
+++ b/unravel/soccer/graphs/graph_converter_pl.py
@@ -45,8 +45,14 @@ class SoccerGraphConverterPolars(DefaultGraphConverter):
 
     def __post_init__(self):
         self.pitch_dimensions: MetricPitchDimensions = self.dataset.pitch_dimensions
-        self.label_col = self.dataset._label_column
-        self.graph_id_col = self.dataset._graph_id_column
+        self.label_column: str = (
+            self.label_col if self.label_col is not None else self.dataset._label_column
+        )
+        self.graph_id_column: str = (
+            self.graph_id_col
+            if self.graph_id_col is not None
+            else self.dataset._graph_id_column
+        )
 
         self.dataset = self.dataset.data
 
@@ -76,8 +82,8 @@ def _apply_padding(self) -> pl.DataFrame:
             Column.TIMESTAMP,
             Column.BALL_STATE,
             Column.POSITION_NAME,
-            self.label_col,
-            self.graph_id_col,
+            self.label_column,
+            self.graph_id_column,
         ]
         empty_columns = [
             Column.OBJECT_ID,
@@ -240,21 +246,21 @@ def _apply_settings(self):
         )
 
     def _sport_specific_checks(self):
-        if not isinstance(self.label_col, str):
+        if not isinstance(self.label_column, str):
             raise Exception("'label_col' should be of type string (str)")
 
-        if not isinstance(self.graph_id_col, str):
+        if not isinstance(self.graph_id_column, str):
             raise Exception("'graph_id_col' should be of type string (str)")
 
         if not isinstance(self.chunk_size, int):
             raise Exception("chunk_size should be of type integer (int)")
 
-        if not self.label_col in self.dataset.columns and not self.prediction:
+        if not self.label_column in self.dataset.columns and not self.prediction:
             raise Exception(
                 "Please specify a 'label_col' and add that column to your 'dataset' or set 'prediction=True' if you want to use the converted dataset to make predictions on."
             )
 
-        if not self.graph_id_col in self.dataset.columns:
+        if not self.graph_id_column in self.dataset.columns:
             raise Exception(
                 "Please specify a 'graph_id_col' and add that column to your 'dataset' ..."
             )
@@ -284,20 +290,20 @@ def __exprs_variables(self):
             Column.POSITION_NAME,
             Column.BALL_OWNING_TEAM_ID,
             Column.IS_BALL_CARRIER,
-            self.graph_id_col,
-            self.label_col,
+            self.graph_id_column,
+            self.label_column,
         ]
 
     def __compute(self, args: List[pl.Series]) -> dict:
         d = {col: args[i].to_numpy() for i, col in enumerate(self.__exprs_variables)}
 
-        if not np.all(d[self.graph_id_col] == d[self.graph_id_col][0]):
+        if not np.all(d[self.graph_id_column] == d[self.graph_id_column][0]):
             raise Exception(
                 "GraphId selection contains multiple different values. Make sure each graph_id is unique by at least game_id and frame_id..."
             )
 
         if not self.prediction and not np.all(
-            d[self.label_col] == d[self.label_col][0]
+            d[self.label_column] == d[self.label_column][0]
         ):
             raise Exception(
                 """Label selection contains multiple different values for a single selection (group by) of game_id and frame_id, 
@@ -356,8 +362,8 @@ def __compute(self, args: List[pl.Series]) -> dict:
             "x_shape_1": node_features.shape[1],
             "a_shape_0": adjacency_matrix.shape[0],
             "a_shape_1": adjacency_matrix.shape[1],
-            self.graph_id_col: d[self.graph_id_col][0],
-            self.label_col: d[self.label_col][0],
+            self.graph_id_column: d[self.graph_id_column][0],
+            self.label_column: d[self.label_column][0],
         }
 
     def _convert(self):
@@ -373,7 +379,13 @@ def _convert(self):
                 [
                     *[
                         pl.col("result_dict").struct.field(f).alias(f)
-                        for f in ["a", "e", "x", self.graph_id_col, self.label_col]
+                        for f in [
+                            "a",
+                            "e",
+                            "x",
+                            self.graph_id_column,
+                            self.label_column,
+                        ]
                     ],
                     *[
                         pl.col("result_dict")
@@ -402,8 +414,8 @@ def process_chunk(chunk: pl.DataFrame) -> List[dict]:
                     "e": reshape_array(
                         chunk["e"][i], chunk["e_shape_0"][i], chunk["e_shape_1"][i]
                     ),
-                    "y": np.asarray([chunk[self.label_col][i]]),
-                    "id": chunk[self.graph_id_col][i],
+                    "y": np.asarray([chunk[self.label_column][i]]),
+                    "id": chunk[self.graph_id_column][i],
                 }
                 for i in range(len(chunk))
             ]
diff --git a/unravel/utils/objects/default_graph_converter.py b/unravel/utils/objects/default_graph_converter.py
index 79bc16e..dfc9133 100644
--- a/unravel/utils/objects/default_graph_converter.py
+++ b/unravel/utils/objects/default_graph_converter.py
@@ -87,6 +87,9 @@ class DefaultGraphConverter:
     pad: bool = False
     verbose: bool = False
 
+    label_col: str = None
+    graph_id_col: str = None
+
     graph_frames: dict = field(init=False, repr=False, default=None)
     settings: DefaultGraphSettings = field(
         init=False, repr=False, default_factory=DefaultGraphSettings

From 2b4e6a84234f531966c5984bbdf9e97a4cb780f4 Mon Sep 17 00:00:00 2001
From: "UnravelSports [JB]" <jors@unravelsports.com>
Date: Mon, 27 Jan 2025 09:35:04 +0100
Subject: [PATCH 09/10] updated examples

---
 examples/1_kloppy_gnn_train.ipynb            | 143 ++--
 examples/deprecated/1_kloppy_gnn_train.ipynb | 794 +++++++++++++++++++
 examples/graphs_faq.md                       |   6 +-
 unravel/soccer/graphs/graph_converter_pl.py  |   9 +-
 4 files changed, 896 insertions(+), 56 deletions(-)
 create mode 100644 examples/deprecated/1_kloppy_gnn_train.ipynb

diff --git a/examples/1_kloppy_gnn_train.ipynb b/examples/1_kloppy_gnn_train.ipynb
index be6bd2e..39aee88 100644
--- a/examples/1_kloppy_gnn_train.ipynb
+++ b/examples/1_kloppy_gnn_train.ipynb
@@ -4,7 +4,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## 🌀 unravel kloppy into graph neural network!\n",
+    "## 🌀 unravel kloppy into graph neural network using the _new_ Polars back-end!\n",
     "\n",
     "First run `pip install unravelsports` if you haven't already!\n",
     "\n",
@@ -25,11 +25,11 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "In this in-depth walkthrough we'll discuss everything the `unravelsports` package has to offer for converting a [Kloppy](https://github.com/PySport/kloppy) dataset of soccer tracking data into graphs for training binary classification graph neural networks using the [Spektral](https://graphneural.network/) library.\n",
+    "In this in-depth walkthrough we'll discuss everything the `unravelsports` package has to offer for converting a [Kloppy](https://github.com/PySport/kloppy) dataset of soccer tracking data into graphs for training binary classification graph neural networks using the [Spektral](https://graphneural.network/) library, and a newly added (version==0.3.0+) [Polars](https://pola.rs/) back-end.\n",
     "\n",
     "This walkthrough will touch on a lot of the concepts from [A Graph Neural Network Deep-dive into Successful Counterattacks {A. Sahasrabudhe & J. Bekkers}](https://github.com/USSoccerFederation/ussf_ssac_23_soccer_gnn). It is strongly advised to first read the [research paper (pdf)](https://ussf-ssac-23-soccer-gnn.s3.us-east-2.amazonaws.com/public/Sahasrabudhe_Bekkers_SSAC23.pdf). Some concepts are also explained in the [Graphs FAQ](graphs_faq.md).\n",
     "\n",
-    "Step by step we'll show how this package can be used to load soccer positional (tracking) data with `kloppy`, how to convert this data into \"graphs\", train a Graph Neural Network with `spektral`, evaluate it's performance, save and load the model and finally apply the model to unseen data to make predictions.\n",
+    "Step by step we'll show how this package can be used to load soccer positional (tracking) data with `kloppy`, how to convert this data into a `KloppyPolarsDataset`, convert it into \"graphs\", train a Graph Neural Network with `spektral`, evaluate its performance, save and load the model and finally apply the model to unseen data to make predictions.\n",
     "\n",
     "The powerful Kloppy package allows us to load and standardize data from many providers: Metrica, Sportec, Tracab, SecondSpectrum, StatsPerform and SkillCorner. In this guide we'll use some matches from the [Public SkillCorner Dataset](https://github.com/SkillCorner/opendata).\n",
     "\n",
@@ -42,7 +42,7 @@
     "\n",
     "- [**1. Imports**](#1-imports).\n",
     "- [**2. Public SkillCorner Data**](#2-public-skillcorner-data).\n",
-    "- [**3. Graph Converter**](#2-open-skillcorner-data).\n",
+    "- [**3. ⭐ _KloppyPolarsDataset_ and _SoccerGraphConverterPolars_**](#2-open-skillcorner-data).\n",
     "- [**4. Load Kloppy Data, Convert & Store**](#4-load-kloppy-data-convert-and-store).\n",
     "- [**5. Creating a Custom Graph Dataset**](#5-creating-a-custom-graph-dataset).\n",
     "- [**6. Prepare for Training**](#6-prepare-for-training).\n",
@@ -68,18 +68,18 @@
    "source": [
     "### 1. Imports\n",
     "\n",
-    "We import `SoccerGraphConverter` to help us convert from Kloppy positional tracking frames to graphs.\n",
+    "We import `SoccerGraphConverterPolars` to help us convert from Kloppy positional tracking frames to graphs.\n",
     "\n",
     "With the power of **Kloppy** we can also load data from many providers by importing `metrica`, `sportec`, `tracab`, `secondspectrum`, or `statsperform` from `kloppy`."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
-    "from unravel.soccer import SoccerGraphConverter\n",
+    "from unravel.soccer import SoccerGraphConverterPolars, KloppyPolarsDataset\n",
     "\n",
     "from kloppy import skillcorner"
    ]
@@ -128,33 +128,73 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### 3. Graph Converter\n",
+    "### 3. ⭐ _KloppyPolarsDataset_ and _SoccerGraphConverterPolars_\n",
     "\n",
     "ℹ️ For more information on:\n",
     "- What a Graph is, check out [Graph FAQ Section A](graphs_faq.ipynb)\n",
-    "- What parameters we can pass to the `SoccerGraphConverter`, check out [Graph FAQ Section B](graphs_faq.ipynb)\n",
+    "- What parameters we can pass to the `SoccerGraphConverterPolars`, check out [Graph FAQ Section B](graphs_faq.ipynb)\n",
     "- What features each Graph has, check out [Graph FAQ Section C](graphs_faq.ipynb)\n",
     "\n",
-    "---\n",
+    "------\n",
     "\n",
-    "To get started with the `SoccerGraphConverter` we need to pass one _required_ parameter:\n",
-    "- `dataset` (of type `TrackingDataset` (Kloppy)) \n",
+    "To get started we need to load our tracking data using Kloppy, and subsequently pass this to the `KloppyPolarsDataset`. This `KloppyPolarsDataset` also takes the `ball_carrier_threshold` parameter.\n",
     "\n",
-    "And one parameter that's required when we're converting for training purposes (more on this later):\n",
-    "- `labels` (a dictionary with `frame_id`s as keys and a value of `{True, False, 1 or 0}`).\n",
-    "```python\n",
-    "{83340: True, 83341: False, etc..} = {83340: 1, 83341: 0, etc..} =  {83340: 1, 83341: False, etc..}\n",
-    "```\n",
-    "⚠️ As mentioned before you will need to create your own labels! In this example we'll use `dummy_labels(dataset)` to generate a fake label for each frame.\n",
+    "🗒️ KloppyPolarsDataset sets the orientation to `Orientation.BALL_OWNING_TEAM` (ball owning team plays left to right). Except when we don't know who the ball owning team is. This can happen when a data provider does not provide the ball owning team information.\n",
+    "If our dataset does not have the ball owning team we infer the ball owning team automatically using the `ball_carrier_threshold` and subsequently change the orientation automatically to be left to right for the ball owning team too.\n",
+    "In `SoccerGraphConverter` [deprecated] if the ball owning team was not available we set the orientation to STATIC_HOME_AWAY meaning attacking could happen in two directions. \n",
+    "\n",
+    "<div style=\"border: 2px solid #ddd; border-radius: 5px; padding: 10px; background-color: ##282C34;\">\n",
+    "<pre>\n",
+    "kloppy_dataset = skillcorner.load_open_data(\n",
+    "    match_id=match_id,\n",
+    "    coordinates=\"secondspectrum\",\n",
+    "    include_empty_frames=False,\n",
+    "    limit=500,  \n",
+    ")\n",
+    "kloppy_polars_dataset = KloppyPolarsDataset(\n",
+    "    kloppy_dataset=kloppy_dataset,\n",
+    "    ball_carrier_threshold=25.0\n",
+    ")\n",
+    "kloppy_polars_dataset.load()\n",
+    "</pre>\n",
+    "</div>\n",
     "\n",
     "#### Graph Identifier(s):\n",
-    "When training a model on tracking data it's highly recommended to split data into test/train(/validation) sets by match or period such that all data end up in the same test, train or validation set. This should be done to avoid leaking information between test, train and validation sets. To make this simple, there are two _optional_ parameters we can pass to `SoccerGraphConverter`, namely:\n",
-    "- `graph_id`. This is a single identifier (str or int) for a whole match, for example the unique match id.\n",
-    "- `graph_ids`. This is a dictionary with the same keys as `labels`, but the values are now the unique identifiers. This option can be used if we want to split by sequence or possession_id. For example: {frame_id: 'matchId-sequenceId', frame_id: 'match_Id-sequenceId2'} etc. You will need to create your own ids. Note, if `labels` and `graph_ids` don't have the exact same keys it will throw an error.\n",
+    "After loading the `kloppy_polars_dataset` we now add graph identifiers. We can do this by passing a list of column names on which we want to split our data.\n",
+    "\n",
+    "🗒️ When training a model on tracking data it's highly recommended to split data into test/train(/validation) sets by match or period such that all data end up in the same test, train or validation set. This should be done to avoid leaking information between test, train and validation sets. Correctly splitting the final dataset into train, test and validation sets using these Graph Identifiers is incorporated into `CustomSpektralDataset` (see [Section 6.1](#61-split-dataset) for more information).\n",
+    "\n",
+    "\n",
+    "<div style=\"border: 2px solid #ddd; border-radius: 5px; padding: 10px; background-color: ##282C34;\">\n",
+    "<pre>\n",
+    "kloppy_polars_dataset.add_graph_ids(by=[\"game_id\", \"period_id\"])\n",
+    "</pre>\n",
+    "</div>\n",
+    "\n",
+    "#### Graph Labels\n",
     "\n",
-    "In this example we'll use the `graph_id=match_id` as the unique identifier, but feel free to change that for `graph_ids=dummy_graph_ids(dataset)` to test out that behavior.\n",
+    "Now, we can add our (binary) labels to the dataset. In all examples we do this using `kloppy_polars_dataset.add_dummy_labels()`, but these are random labels and will not help with training.\n",
     "\n",
-    "Correctly splitting the final dataset in train, test and validiation sets using these Graph Identifiers is incorporated into `CustomSpektralDataset` (see [Section 6.1](#61-split-dataset) for more information)."
+    "To add useful labels for your task you need to \"join\" a Polars dataframe that contains a column with the required labels to the `kloppy_polars_dataset.data` Polars dataframe. Please note that in this dataframe each row is a single player (or ball) object, and thus each `frame_id` has 23 rows (if all players and ball are observed). All these rows (for a single frame_id) need to have _the same_ label. If your label column is not named `\"label\"` you need to pass the `label_col` (str) parameter to `SoccerGraphConverterPolars`.\n",
+    "\n",
+    "<div style=\"border: 2px solid #ddd; border-radius: 5px; padding: 10px; background-color: ##282C34;\">\n",
+    "<pre>\n",
+    "kloppy_polars_dataset.data = (\n",
+    "    kloppy_polars_dataset.data\n",
+    "    .join(\n",
+    "        some_label_dataframe.select([\"game_id\", \"period_id\", \"frame_id\", \"label\"]), \n",
+    "        on=[\"game_id\", \"period_id\", \"frame_id\"],\n",
+    "        how=\"left\"\n",
+    "    )\n)\n",
+    "</pre>\n",
+    "</div>\n",
+    "\n",
+    "### SoccerGraphConverterPolars\n",
+    "\n",
+    "To get started with the `SoccerGraphConverterPolars` we need to pass one _required_ parameter:\n",
+    "- `dataset` (of type `KloppyPolarsDataset`) \n",
+    "\n",
+    "For a full list of other parameters we can pass to the `SoccerGraphConverterPolars`, check out [Graph FAQ Section B](graphs_faq.ipynb)"
    ]
   },
   {
@@ -174,9 +214,7 @@
     "As mentioned in [Section 2](#2-public-skillcorner-data) we will use 4 matches of SkillCorner data. In the below example we will load the first 500 frames of data from each of these 4 games (we set `limit=500`) to create a dataset of 2,000 samples (Note: We're going to actually have less than 2,000 samples because setting `include_empty_frames=False` means we'll skip some frames in our conversion step).\n",
     "\n",
     "Important things to note:\n",
-    "- We import `dummy_labels` to randomly generate binary labels. Training with these random labels will not create a good model.\n",
-    "- We import `dummy_graph_ids` to generate fake graph labels.\n",
-    "- The `SoccerGraphConverter` handles all necessary steps (like setting the correct coordinate system, and left-right normalization).\n",
+    "- The `SoccerGraphConverterPolars` handles all necessary steps (like setting the correct coordinate system, and left-right normalization).\n",
     "- We will end up with fewer than 2,000 eventhough we set `limit=500` frames because we set `include_empty_frames=False` and all frames without ball coordinates are automatically ommited.\n",
     "- When using other providers always set `include_empty_frames=False` or `only_alive=True`.\n",
     "- We store the data as individual compressed pickle files, one file for per match. The data that gets stored in the pickle is a list of dictionaries, one dictionary per frame. Each dictionary has keys for the adjacency matrix, node features, edge features, label and graph id."
@@ -184,25 +222,35 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Processing frames: 100%|██████████| 500/500 [00:02<00:00, 244.81it/s]\n",
-      "Processing frames: 100%|██████████| 500/500 [00:01<00:00, 285.65it/s]\n",
-      "Processing frames: 100%|██████████| 500/500 [00:01<00:00, 343.58it/s] \n",
-      "Processing frames: 100%|██████████| 500/500 [00:01<00:00, 285.17it/s]\n"
+      "/Users/jbekkers/PycharmProjects/unravelsports/.venv311/lib/python3.11/site-packages/unravel/soccer/graphs/graph_converter_pl.py:187: UserWarning: Setting pad=True drops frames that do not have at least 1 object for the attacking team, defending team or ball.\n",
+      "                This operation dropped 8 incomplete frames out of 488 total frames (1.64%)\n",
+      "                \n",
+      "  warnings.warn(\n",
+      "/Users/jbekkers/PycharmProjects/unravelsports/.venv311/lib/python3.11/site-packages/unravel/soccer/graphs/graph_converter_pl.py:187: UserWarning: Setting pad=True drops frames that do not have at least 1 object for the attacking team, defending team or ball.\n",
+      "                This operation dropped 96 incomplete frames out of 487 total frames (19.71%)\n",
+      "                \n",
+      "  warnings.warn(\n",
+      "/Users/jbekkers/PycharmProjects/unravelsports/.venv311/lib/python3.11/site-packages/unravel/soccer/graphs/graph_converter_pl.py:187: UserWarning: Setting pad=True drops frames that do not have at least 1 object for the attacking team, defending team or ball.\n",
+      "                This operation dropped 156 incomplete frames out of 494 total frames (31.58%)\n",
+      "                \n",
+      "  warnings.warn(\n",
+      "/Users/jbekkers/PycharmProjects/unravelsports/.venv311/lib/python3.11/site-packages/unravel/soccer/graphs/graph_converter_pl.py:187: UserWarning: Setting pad=True drops frames that do not have at least 1 object for the attacking team, defending team or ball.\n",
+      "                This operation dropped 87 incomplete frames out of 500 total frames (17.40%)\n",
+      "                \n",
+      "  warnings.warn(\n"
      ]
     }
    ],
    "source": [
     "from os.path import exists\n",
     "\n",
-    "from unravel.utils import dummy_labels, dummy_graph_ids\n",
-    "\n",
     "match_ids = [4039, 3749, 3518, 3442]\n",
     "pickle_folder = \"pickles\"\n",
     "compressed_pickle_file_path = \"{pickle_folder}/{match_id}.pickle.gz\"\n",
@@ -213,33 +261,33 @@
     "    )\n",
     "    # if the output file already exists, skip this whole step\n",
     "    if not exists(match_pickle_file_path):\n",
-    "\n",
     "        # Load Kloppy dataset\n",
-    "        dataset = skillcorner.load_open_data(\n",
+    "        kloppy_dataset = skillcorner.load_open_data(\n",
     "            match_id=match_id,\n",
     "            coordinates=\"secondspectrum\",\n",
     "            include_empty_frames=False,\n",
-    "            limit=500,  # limit to 500 frames in this example\n",
+    "            limit=500,  \n",
+    "        )\n",
+    "        dataset = KloppyPolarsDataset(\n",
+    "            kloppy_dataset=kloppy_dataset,\n",
+    "            ball_carrier_threshold=25.0\n",
     "        )\n",
+    "        dataset.load()\n",
+    "        \n",
+    "        dataset.add_graph_ids()\n",
+    "        \n",
+    "        dataset.add_dummy_labels()\n",
     "\n",
     "        # Initialize the Graph Converter, with dataset, labels and settings\n",
-    "        converter = SoccerGraphConverter(\n",
+    "        converter = SoccerGraphConverterPolars(\n",
     "            dataset=dataset,\n",
-    "            # create fake labels\n",
-    "            labels=dummy_labels(dataset),\n",
-    "            graph_id=match_id,\n",
-    "            # graph_ids=dummy_graph_ids(dataset),\n",
     "            # Settings\n",
-    "            ball_carrier_treshold=25.0,\n",
     "            max_player_speed=12.0,\n",
     "            max_ball_speed=28.0,\n",
-    "            boundary_correction=None,\n",
     "            self_loop_ball=True,\n",
     "            adjacency_matrix_connect_type=\"ball\",\n",
     "            adjacency_matrix_type=\"split_by_team\",\n",
     "            label_type=\"binary\",\n",
-    "            infer_ball_ownership=True,\n",
-    "            infer_goalkeepers=True,\n",
     "            defending_team_node_value=0.1,\n",
     "            non_potential_receiver_node_value=0.1,\n",
     "            random_seed=False,\n",
@@ -254,7 +302,6 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "ℹ️ For a full table of parameters we can pass to the `SoccerGraphConverter` check out [Graph FAQ Section B](graphs_faq.ipynb)\n",
     "\n",
     "-----"
    ]
@@ -303,7 +350,7 @@
     "Our `dataset` object has two custom methods to help split the data into train, test and validation sets.\n",
     "Either use `dataset.split_test_train()` if we don't need a validation set, or `dataset.split_test_train_validation()` if we do also require a validation set.\n",
     "\n",
-    "We can split our data 'by_graph_id' if we have provided Graph Ids in our `SoccerGraphConverter` using the 'graph_id' or 'graph_ids' parameter.\n",
+    "We can split our data 'by_graph_id' if we have added Graph Ids to our `KloppyPolarsDataset` using `add_graph_ids()` before passing it to the `SoccerGraphConverterPolars`.\n",
     "\n",
     "The 'split_train', 'split_test' and 'split_validation' parameters can either be ratios, percentages or relative size compared to total. \n",
     "\n",
@@ -786,7 +833,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.9"
+   "version": "3.11.11"
   }
  },
  "nbformat": 4,
diff --git a/examples/deprecated/1_kloppy_gnn_train.ipynb b/examples/deprecated/1_kloppy_gnn_train.ipynb
new file mode 100644
index 0000000..b7a3089
--- /dev/null
+++ b/examples/deprecated/1_kloppy_gnn_train.ipynb
@@ -0,0 +1,794 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 🌀 unravel kloppy into graph neural network!\n",
+    "\n",
+    "First run `pip install unravelsports` if you haven't already!\n",
+    "\n",
+    "\n",
+    "-----\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%pip install unravelsports --quiet"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In this in-depth walkthrough we'll discuss everything the `unravelsports` package has to offer for converting a [Kloppy](https://github.com/PySport/kloppy) dataset of soccer tracking data into graphs for training binary classification graph neural networks using the [Spektral](https://graphneural.network/) library.\n",
+    "\n",
+    "This walkthrough will touch on a lot of the concepts from [A Graph Neural Network Deep-dive into Successful Counterattacks {A. Sahasrabudhe & J. Bekkers}](https://github.com/USSoccerFederation/ussf_ssac_23_soccer_gnn). It is strongly advised to first read the [research paper (pdf)](https://ussf-ssac-23-soccer-gnn.s3.us-east-2.amazonaws.com/public/Sahasrabudhe_Bekkers_SSAC23.pdf). Some concepts are also explained in the [Graphs FAQ](graphs_faq.md).\n",
+    "\n",
+    "Step by step we'll show how this package can be used to load soccer positional (tracking) data with `kloppy`, how to convert this data into \"graphs\", train a Graph Neural Network with `spektral`, evaluate its performance, save and load the model and finally apply the model to unseen data to make predictions.\n",
+    "\n",
+    "The powerful Kloppy package allows us to load and standardize data from many providers: Metrica, Sportec, Tracab, SecondSpectrum, StatsPerform and SkillCorner. In this guide we'll use some matches from the [Public SkillCorner Dataset](https://github.com/SkillCorner/opendata).\n",
+    "\n",
+    "<br>\n",
+    "<i>Before we get started it is important to note that the <b>unravelsports</b> library does not have built in functionality to create binary labels, these will need to be supplied by the reader. In this example we use the <b>dummy_labels()</b> functionality that comes with the package. This function creates a single binary label for each frame by randomly assigning it a 0 or 1 value.\n",
+    "</i>\n",
+    "<br>\n",
+    "\n",
+    "##### **Contents**\n",
+    "\n",
+    "- [**1. Imports**](#1-imports).\n",
+    "- [**2. Public SkillCorner Data**](#2-public-skillcorner-data).\n",
+    "- [**3. Graph Converter**](#3-graph-converter).\n",
+    "- [**4. Load Kloppy Data, Convert & Store**](#4-load-kloppy-data-convert-and-store).\n",
+    "- [**5. Creating a Custom Graph Dataset**](#5-creating-a-custom-graph-dataset).\n",
+    "- [**6. Prepare for Training**](#6-prepare-for-training).\n",
+    "    - [6.1 Split Dataset](#61-split-dataset)\n",
+    "    - [6.2 Model Configurations](#62-model-configurations)\n",
+    "    - [6.3 Build GNN Model](#63-build-gnn-model)\n",
+    "    - [6.4 Create DataLoaders](#64-create-dataloaders)\n",
+    "- [**7. GNN Training + Prediction**](#7-training-and-prediction).\n",
+    "    - [7.1 Compile Model](#71-compile-model)\n",
+    "    - [7.2 Fit Model](#72-fit-model)\n",
+    "    - [7.3 Save & Load Model](#73-save--load-model)\n",
+    "    - [7.4 Evaluate Model](#74-evaluate-model)\n",
+    "    - [7.5 Predict on New Data](#75-predict-on-new-data)\n",
+    "\n",
+    "ℹ️ [**Graphs FAQ**](graphs_faq.md)\n",
+    "\n",
+    "-----"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 1. Imports\n",
+    "\n",
+    "We import `SoccerGraphConverter` to help us convert from Kloppy positional tracking frames to graphs.\n",
+    "\n",
+    "With the power of **Kloppy** we can also load data from many providers by importing `metrica`, `sportec`, `tracab`, `secondspectrum`, or `statsperform` from `kloppy`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from unravel.soccer import SoccerGraphConverter\n",
+    "\n",
+    "from kloppy import skillcorner"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "-----"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 2. Public SkillCorner Data\n",
+    "\n",
+    "The `SoccerGraphConverter` class allows processing data from every tracking data provider supported by [PySport Kloppy](https://github.com/PySport/kloppy), namely:\n",
+    "- Sportec\n",
+    "- Tracab\n",
+    "- SecondSpectrum\n",
+    "- SkillCorner\n",
+    "- StatsPerform\n",
+    "- Metrica\n",
+    "\n",
+    "In this example we're going to use a sample of tracking data from 4 matches of [publicly available SkillCorner data](https://github.com/SkillCorner/opendata). \n",
+    "\n",
+    "All we need to know for now is that this data is from the following matches:\n",
+    "\n",
+    "|  id | date_time           | home_team   | away_team   |\n",
+    "|---:|:---------------------:|:-----------------------|:-----------------------|\n",
+    "|  4039 | 2020-07-02T19:15:00Z | Manchester City        | Liverpool              |\n",
+    "|  3749 | 2020-05-26T16:30:00Z | Dortmund               | Bayern Munchen         |\n",
+    "|  3518 | 2020-03-08T19:45:00Z | Juventus               | Inter                  |\n",
+    "|  3442 | 2020-03-01T20:00:00Z | Real Madrid            | FC Barcelona           |"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "-----"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 3. Graph Converter\n",
+    "\n",
+    "ℹ️ For more information on:\n",
+    "- What a Graph is, check out [Graph FAQ Section A](graphs_faq.ipynb)\n",
+    "- What parameters we can pass to the `SoccerGraphConverter`, check out [Graph FAQ Section B](graphs_faq.ipynb)\n",
+    "- What features each Graph has, check out [Graph FAQ Section C](graphs_faq.ipynb)\n",
+    "\n",
+    "---\n",
+    "\n",
+    "To get started with the `SoccerGraphConverter` we need to pass one _required_ parameter:\n",
+    "- `dataset` (of type `TrackingDataset` (Kloppy)) \n",
+    "\n",
+    "And one parameter that's required when we're converting for training purposes (more on this later):\n",
+    "- `labels` (a dictionary with `frame_id`s as keys and a value of `{True, False, 1 or 0}`).\n",
+    "```python\n",
+    "{83340: True, 83341: False, etc..} = {83340: 1, 83341: 0, etc..} =  {83340: 1, 83341: False, etc..}\n",
+    "```\n",
+    "⚠️ As mentioned before you will need to create your own labels! In this example we'll use `dummy_labels(dataset)` to generate a fake label for each frame.\n",
+    "\n",
+    "#### Graph Identifier(s):\n",
+    "When training a model on tracking data it's highly recommended to split data into test/train(/validation) sets by match or period such that all data end up in the same test, train or validation set. This should be done to avoid leaking information between test, train and validation sets. To make this simple, there are two _optional_ parameters we can pass to `SoccerGraphConverter`, namely:\n",
+    "- `graph_id`. This is a single identifier (str or int) for a whole match, for example the unique match id.\n",
+    "- `graph_ids`. This is a dictionary with the same keys as `labels`, but the values are now the unique identifiers. This option can be used if we want to split by sequence or possession_id. For example: {frame_id: 'matchId-sequenceId', frame_id: 'match_Id-sequenceId2'} etc. You will need to create your own ids. Note, if `labels` and `graph_ids` don't have the exact same keys it will throw an error.\n",
+    "\n",
+    "In this example we'll use the `graph_id=match_id` as the unique identifier, but feel free to change that for `graph_ids=dummy_graph_ids(dataset)` to test out that behavior.\n",
+    "\n",
+    "Correctly splitting the final dataset into train, test and validation sets using these Graph Identifiers is incorporated into `CustomSpektralDataset` (see [Section 6.1](#61-split-dataset) for more information)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "------"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "### 4. Load Kloppy Data, Convert and Store\n",
+    "\n",
+    "As mentioned in [Section 2](#2-public-skillcorner-data) we will use 4 matches of SkillCorner data. In the below example we will load the first 500 frames of data from each of these 4 games (we set `limit=500`) to create a dataset of 2,000 samples (Note: We're going to actually have less than 2,000 samples because setting `include_empty_frames=False` means we'll skip some frames in our conversion step).\n",
+    "\n",
+    "Important things to note:\n",
+    "- We import `dummy_labels` to randomly generate binary labels. Training with these random labels will not create a good model.\n",
+    "- We import `dummy_graph_ids` to generate fake graph labels.\n",
+    "- The `SoccerGraphConverter` handles all necessary steps (like setting the correct coordinate system, and left-right normalization).\n",
+    "- We will end up with fewer than 2,000 samples even though we set `limit=500` frames, because we set `include_empty_frames=False` and all frames without ball coordinates are automatically omitted.\n",
+    "- When using other providers always set `include_empty_frames=False` or `only_alive=True`.\n",
+    "- We store the data as individual compressed pickle files, one file per match. The data that gets stored in the pickle is a list of dictionaries, one dictionary per frame. Each dictionary has keys for the adjacency matrix, node features, edge features, label and graph id."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Processing frames: 100%|██████████| 500/500 [00:02<00:00, 244.81it/s]\n",
+      "Processing frames: 100%|██████████| 500/500 [00:01<00:00, 285.65it/s]\n",
+      "Processing frames: 100%|██████████| 500/500 [00:01<00:00, 343.58it/s] \n",
+      "Processing frames: 100%|██████████| 500/500 [00:01<00:00, 285.17it/s]\n"
+     ]
+    }
+   ],
+   "source": [
+    "from os.path import exists\n",
+    "\n",
+    "from unravel.utils import dummy_labels, dummy_graph_ids\n",
+    "\n",
+    "match_ids = [4039, 3749, 3518, 3442]\n",
+    "pickle_folder = \"pickles\"\n",
+    "compressed_pickle_file_path = \"{pickle_folder}/{match_id}.pickle.gz\"\n",
+    "\n",
+    "for match_id in match_ids:\n",
+    "    match_pickle_file_path = compressed_pickle_file_path.format(\n",
+    "        pickle_folder=pickle_folder, match_id=match_id\n",
+    "    )\n",
+    "    # if the output file already exists, skip this whole step\n",
+    "    if not exists(match_pickle_file_path):\n",
+    "\n",
+    "        # Load Kloppy dataset\n",
+    "        dataset = skillcorner.load_open_data(\n",
+    "            match_id=match_id,\n",
+    "            coordinates=\"secondspectrum\",\n",
+    "            include_empty_frames=False,\n",
+    "            limit=500,  # limit to 500 frames in this example\n",
+    "        )\n",
+    "\n",
+    "        # Initialize the Graph Converter, with dataset, labels and settings\n",
+    "        converter = SoccerGraphConverter(\n",
+    "            dataset=dataset,\n",
+    "            # create fake labels\n",
+    "            labels=dummy_labels(dataset),\n",
+    "            graph_id=match_id,\n",
+    "            # graph_ids=dummy_graph_ids(dataset),\n",
+    "            # Settings\n",
+    "            ball_carrier_treshold=25.0,\n",
+    "            max_player_speed=12.0,\n",
+    "            max_ball_speed=28.0,\n",
+    "            boundary_correction=None,\n",
+    "            self_loop_ball=True,\n",
+    "            adjacency_matrix_connect_type=\"ball\",\n",
+    "            adjacency_matrix_type=\"split_by_team\",\n",
+    "            label_type=\"binary\",\n",
+    "            infer_ball_ownership=True,\n",
+    "            infer_goalkeepers=True,\n",
+    "            defending_team_node_value=0.1,\n",
+    "            non_potential_receiver_node_value=0.1,\n",
+    "            random_seed=False,\n",
+    "            pad=True,\n",
+    "            verbose=False,\n",
+    "        )\n",
+    "        # Compute the graphs and directly store them as a pickle file\n",
+    "        converter.to_pickle(file_path=match_pickle_file_path)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "ℹ️ For a full table of parameters we can pass to the `SoccerGraphConverter` check out [Graph FAQ Section B](graphs_faq.ipynb)\n",
+    "\n",
+    "-----"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 5. Creating a Custom Graph Dataset\n",
+    "\n",
+    "To easily train our model with the Spektral library we need to use a Spektral dataset object. The `CustomSpektralDataset` class helps us create such an object really easily.\n",
+    "\n",
+    "- `CustomSpektralDataset` is a [`spektral.data.Dataset`](https://graphneural.network/creating-dataset/). \n",
+    "This type of dataset makes it very easy to properly load, train and predict with a Spektral GNN.\n",
+    "- The `CustomSpektralDataset` has an option to load from a folder of compressed pickle files, all we have to do is pass the pickle_folder location.\n",
+    "\n",
+    "ℹ️ For more information on the `CustomSpektralDataset` please check the [Graphs FAQ Section D](graphs_faq.ipynb)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from unravel.utils import CustomSpektralDataset\n",
+    "\n",
+    "dataset = CustomSpektralDataset(pickle_folder=pickle_folder)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 6. Prepare for Training\n",
+    "\n",
+    "Now that we have all the data converted into Graphs inside our `CustomSpektralDataset` object, we can prepare to train the GNN model.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 6.1 Split Dataset\n",
+    "\n",
+    "Our `dataset` object has two custom methods to help split the data into train, test and validation sets.\n",
+    "Either use `dataset.split_test_train()` if we don't need a validation set, or `dataset.split_test_train_validation()` if we do also require a validation set.\n",
+    "\n",
+    "We can split our data 'by_graph_id' if we have provided Graph Ids in our `SoccerGraphConverter` using the 'graph_id' or 'graph_ids' parameter.\n",
+    "\n",
+    "The 'split_train', 'split_test' and 'split_validation' parameters can either be ratios, percentages or relative size compared to total. \n",
+    "\n",
+    "We opt to create a test, train _and_ validation set to use in our example."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Train: CustomSpektralDataset(n_graphs=791)\n",
+      "Test: CustomSpektralDataset(n_graphs=477)\n",
+      "Validation: CustomSpektralDataset(n_graphs=336)\n"
+     ]
+    }
+   ],
+   "source": [
+    "train, test, val = dataset.split_test_train_validation(\n",
+    "    split_train=4, split_test=1, split_validation=1, by_graph_id=True, random_seed=42\n",
+    ")\n",
+    "print(\"Train:\", train)\n",
+    "print(\"Test:\", test)\n",
+    "print(\"Validation:\", val)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "🗒️ We can see that, because we are splitting by only 4 different graph_ids here (the 4 match_ids) the ratios aren't perfectly 4 to 1 to 1. If you change the `graph_id=match_id` parameter in the `SoccerGraphConverter` to `graph_ids=dummy_graph_ids(dataset)` you'll see that it's easier to get close to the correct ratios, simply because we have a lot more graph_ids to split across. "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 6.2 Model Configurations"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "learning_rate = 1e-3\n",
+    "epochs = 5  # Increase for actual training\n",
+    "batch_size = 32\n",
+    "channels = 128\n",
+    "n_layers = 3  # Number of CrystalConv layers"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 6.3 Build GNN Model\n",
+    "\n",
+    "This GNN Model has the same architecture as described in [A Graph Neural Network Deep-dive into Successful Counterattacks {A. Sahasrabudhe & J. Bekkers}](https://github.com/USSoccerFederation/ussf_ssac_23_soccer_gnn/tree/main)\n",
+    "\n",
+    "This exact model can also simply be loaded as:\n",
+    "\n",
+    "`from unravel.classifiers import CrystalGraphClassifier` as shown in [Quick Start Guide](0_quick_start_guide.ipynb)\n",
+    "\n",
+    "Below we show the exact same code to make it easier to adjust."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from spektral.layers import GlobalAvgPool, CrystalConv\n",
+    "from tensorflow.keras.layers import Dense, Dropout\n",
+    "from tensorflow.keras.models import Model\n",
+    "\n",
+    "\n",
+    "class CrystalGraphClassifier(Model):\n",
+    "    def __init__(\n",
+    "        self,\n",
+    "        n_layers: int = 3,\n",
+    "        channels: int = 128,\n",
+    "        drop_out: float = 0.5,\n",
+    "        n_out: int = 1,\n",
+    "        **kwargs\n",
+    "    ):\n",
+    "        super().__init__(**kwargs)\n",
+    "\n",
+    "        self.n_layers = n_layers\n",
+    "        self.channels = channels\n",
+    "        self.drop_out = drop_out\n",
+    "        self.n_out = n_out\n",
+    "\n",
+    "        self.conv1 = CrystalConv()\n",
+    "        self.convs = [CrystalConv() for _ in range(1, self.n_layers)]\n",
+    "        self.pool = GlobalAvgPool()\n",
+    "        self.dense1 = Dense(self.channels, activation=\"relu\")\n",
+    "        self.dropout = Dropout(self.drop_out)\n",
+    "        self.dense2 = Dense(self.channels, activation=\"relu\")\n",
+    "        self.dense3 = Dense(self.n_out, activation=\"sigmoid\")\n",
+    "\n",
+    "    def call(self, inputs):\n",
+    "        x, a, e, i = inputs\n",
+    "        x = self.conv1([x, a, e])\n",
+    "        for conv in self.convs:\n",
+    "            x = conv([x, a, e])\n",
+    "        x = self.pool([x, i])\n",
+    "        x = self.dense1(x)\n",
+    "        x = self.dropout(x)\n",
+    "        x = self.dense2(x)\n",
+    "        x = self.dropout(x)\n",
+    "        return self.dense3(x)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 6.4 Create DataLoaders\n",
+    "\n",
+    "Create a Spektral [`DisjointLoader`](https://graphneural.network/loaders/#disjointloader). This DisjointLoader will help us to load batches of Disjoint Graphs for training purposes.\n",
+    "\n",
+    "Note that these Spektral `Loaders` return a generator, so if we want to retrain the model, we need to reload these loaders."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from spektral.data import DisjointLoader\n",
+    "\n",
+    "loader_tr = DisjointLoader(train, batch_size=batch_size, epochs=epochs)\n",
+    "loader_va = DisjointLoader(val, epochs=1, shuffle=False, batch_size=batch_size)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "--------"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 7. Training and Prediction\n",
+    "\n",
+    "Below we outline how to train the model, make predictions and add the predicted values back to the Kloppy dataframe."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 7.1 Compile Model\n",
+    "\n",
+    "1. Initialize the `CrystalGraphClassifier` (or create your own Graph Classifier).\n",
+    "2. Compile the model with a loss function, optimizer and your preferred metrics."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from tensorflow.keras.metrics import AUC, BinaryAccuracy\n",
+    "from tensorflow.keras.losses import BinaryCrossentropy\n",
+    "from tensorflow.keras.optimizers import Adam\n",
+    "from tensorflow.keras.callbacks import EarlyStopping\n",
+    "\n",
+    "model = CrystalGraphClassifier()\n",
+    "\n",
+    "model.compile(\n",
+    "    loss=BinaryCrossentropy(), optimizer=Adam(), metrics=[AUC(), BinaryAccuracy()]\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 7.2 Fit Model\n",
+    "\n",
+    "1. We have a [`DisjointLoader`](https://graphneural.network/loaders/#disjointloader) for training and validation sets.\n",
+    "2. Fit the model. \n",
+    "3. We add `EarlyStopping` and a `validation_data` dataset to monitor performance, and set `use_multiprocessing=True` to improve training speed.\n",
+    "\n",
+    "⚠️ When trying to fit the model _again_ make sure to reload Data Loaders in [Section 6.4](#64-create-dataloaders), because they are generators."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model.fit(\n",
+    "    loader_tr.load(),\n",
+    "    steps_per_epoch=loader_tr.steps_per_epoch,\n",
+    "    epochs=5,\n",
+    "    use_multiprocessing=True,\n",
+    "    validation_data=loader_va.load(),\n",
+    "    callbacks=[EarlyStopping(monitor=\"loss\", patience=5, restore_best_weights=True)],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 7.3 Save & Load Model\n",
+    "\n",
+    "This step is solely included to show how to restore a model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from tensorflow.keras.models import load_model\n",
+    "\n",
+    "model_path = \"models/my-first-graph-classifier\"\n",
+    "model.save(model_path)\n",
+    "loaded_model = load_model(model_path)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 7.4 Evaluate Model\n",
+    "\n",
+    "1. Create another `DisjointLoader`, this time for the test set.\n",
+    "2. Evaluate model performance on the test set. This evaluation function uses the `metrics` passed to `model.compile`\n",
+    "\n",
+    "🗒️ Our performance is really bad because we're using random labels, very few epochs and a small dataset.\n",
+    "\n",
+    "📖 For more information on evaluation in sports analytics see: [Methodology and evaluation in sports analytics: challenges, approaches, and lessons learned {J. Davis et. al. (2024)}](https://link.springer.com/article/10.1007/s10994-024-06585-0)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "15/15 [==============================] - 0s 4ms/step - loss: 0.7250 - auc: 0.5309 - binary_accuracy: 0.5241\n"
+     ]
+    }
+   ],
+   "source": [
+    "loader_te = DisjointLoader(test, epochs=1, shuffle=False, batch_size=batch_size)\n",
+    "results = model.evaluate(loader_te.load())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 7.5 Predict on New Data\n",
+    "\n",
+    "1. Load new, unseen data from the SkillCorner dataset.\n",
+    "2. Convert this data, making sure we use the exact same settings as in [Section 4](#4-load-kloppy-data-convert-and-store).\n",
+    "3. If we set `prediction=True` we do not have to supply labels to the `SoccerGraphConverter`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "kloppy_dataset = skillcorner.load_open_data(\n",
+    "    match_id=2068,  # A game we have not yet used in section 4\n",
+    "    include_empty_frames=False,\n",
+    "    limit=500,\n",
+    ")\n",
+    "\n",
+    "preds_converter = SoccerGraphConverter(\n",
+    "    dataset=kloppy_dataset,\n",
+    "    prediction=True,\n",
+    "    ball_carrier_treshold=25.0,\n",
+    "    max_player_speed=12.0,\n",
+    "    max_ball_speed=28.0,\n",
+    "    boundary_correction=None,\n",
+    "    self_loop_ball=True,\n",
+    "    adjacency_matrix_connect_type=\"ball\",\n",
+    "    adjacency_matrix_type=\"split_by_team\",\n",
+    "    label_type=\"binary\",\n",
+    "    infer_ball_ownership=True,\n",
+    "    infer_goalkeepers=True,\n",
+    "    defending_team_node_value=0.1,\n",
+    "    non_potential_receiver_node_value=0.1,\n",
+    "    random_seed=False,\n",
+    "    pad=True,\n",
+    "    verbose=False,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "4. Make a prediction on all the frames of this dataset using `model.predict`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Processing frames: 100%|██████████| 500/500 [00:01<00:00, 326.02it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "11/11 [==============================] - 0s 4ms/step\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Compute the graphs and add them to the CustomSpektralDataset\n",
+    "pred_dataset = CustomSpektralDataset(graphs=preds_converter.to_spektral_graphs())\n",
+    "\n",
+    "loader_pred = DisjointLoader(\n",
+    "    pred_dataset, batch_size=batch_size, epochs=1, shuffle=False\n",
+    ")\n",
+    "preds = model.predict(loader_pred.load(), use_multiprocessing=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "5. Convert the Kloppy dataset to a dataframe and merge back the predictions using the frame_ids."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>frame_id</th>\n",
+       "      <th>period_id</th>\n",
+       "      <th>timestamp</th>\n",
+       "      <th>y</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>300</th>\n",
+       "      <td>2166</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0 days 00:00:33.300000</td>\n",
+       "      <td>0.259016</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>301</th>\n",
+       "      <td>2167</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0 days 00:00:33.400000</td>\n",
+       "      <td>0.251124</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>302</th>\n",
+       "      <td>2168</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0 days 00:00:33.500000</td>\n",
+       "      <td>0.258305</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>303</th>\n",
+       "      <td>2169</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0 days 00:00:33.600000</td>\n",
+       "      <td>0.256378</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>304</th>\n",
+       "      <td>2170</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0 days 00:00:33.700000</td>\n",
+       "      <td>0.305434</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "     frame_id  period_id              timestamp         y\n",
+       "300      2166          1 0 days 00:00:33.300000  0.259016\n",
+       "301      2167          1 0 days 00:00:33.400000  0.251124\n",
+       "302      2168          1 0 days 00:00:33.500000  0.258305\n",
+       "303      2169          1 0 days 00:00:33.600000  0.256378\n",
+       "304      2170          1 0 days 00:00:33.700000  0.305434"
+      ]
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "kloppy_df = kloppy_dataset.to_df()\n",
+    "\n",
+    "preds_df = pd.DataFrame(\n",
+    "    {\"frame_id\": [x.id for x in pred_dataset], \"y\": preds.flatten()}\n",
+    ")\n",
+    "\n",
+    "kloppy_df = pd.merge(kloppy_df, preds_df, on=\"frame_id\", how=\"left\")\n",
+    "\n",
+    "kloppy_df[300:305][[\"frame_id\", \"period_id\", \"timestamp\", \"y\"]]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "🗒️ Not all frames have a prediction because of missing (ball) data, so we look at the 300th-305th frame."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv311",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/examples/graphs_faq.md b/examples/graphs_faq.md
index 8ea6495..fa50b53 100644
--- a/examples/graphs_faq.md
+++ b/examples/graphs_faq.md
@@ -51,10 +51,6 @@ In section 6.1 we can see what this looks like in Python.
 | `max_ball_acceleration`                    | float     | The maximum speed of the ball in yards per second squared. Used for normalizing node features.                                                                                                                                                                                                                                                                                                   | 10.0            | 🏈 |
 | `attacking_non_qb_node_value` | float     | Value for the node feature when player is NOT the QB, but is on the attacking team                                                                                                                                                                                                  | 0.1             | 🏈  |
 | `chunk_size` | int     | Set to determine size of conversions from Polars to Graphs. Preferred setting depends on available computing power                                                                                                                                                                                                              | 2_000           | 🏈 |
-| `ball_carrier_threshold`            | float     | The distance threshold to determine the ball carrier in meters. If no ball carrier within ball_carrier_threshold, we skip the frame.                                                                                                                                                                                                                                                      | 25.0            | ⚽ |
-| `boundary_correction`               | float     | A correction factor for boundary calculations, used to correct out of bounds as a percentage (Used as 1+boundary_correction, i.e., 0.05). Not setting this might lead to players outside the pitch markings to have values that fall slightly outside of our normalization range. When we set boundary_correction, any players outside the pitch will be moved to be on the closest line. | None            | ⚽ |
-| `infer_ball_ownership`              | bool      | Infers 'attacking_team' if no 'ball_owning_team' exist (in Kloppy TrackingDataset) by finding the player closest to the ball using ball xyz, uses 'ball_carrier_threshold' as a cut-off.                                                                                                                                                                                                  | True            | ⚽ |
-| `infer_goalkeepers`                 | bool      | Set True if no GK label is provided, set False for incomplete (broadcast tracking) data that might not have a GK in every frame.                                                                                                                                                                                                                                                          | True            | ⚽ |
 | `non_potential_receiver_node_value` | float     | Value for the node feature when player is NOT a potential receiver of a pass (when on opposing team or in possession of the ball). Should be between 0 and 1 including.                                                                                                                                                                                                                   | 0.1             | ⚽ |
 
 
@@ -64,7 +60,7 @@ In section 6.1 we can see what this looks like in Python.
 #### C. What features does each Graph have?
 
 <details>
-    <summary> <b><i> 🌀 ⚽  Expand for a full list of Soccer features </b></i></summary>
+    <summary> <b><i> 🌀 ⚽  Expand for a full list of Soccer features (note: <code>SoccerGraphConverter</code> and <code>SoccerGraphConverterPolars</code> have slightly different features) </i></b></summary>
     
 | Variable | Datatype                          | Index | Features                                                                                                                        |
 |----------|-----------------------------------|-------|---------------------------------------------------------------------------------------------------------------------------------|
diff --git a/unravel/soccer/graphs/graph_converter_pl.py b/unravel/soccer/graphs/graph_converter_pl.py
index 5beeb82..703c57d 100644
--- a/unravel/soccer/graphs/graph_converter_pl.py
+++ b/unravel/soccer/graphs/graph_converter_pl.py
@@ -44,6 +44,9 @@ class SoccerGraphConverterPolars(DefaultGraphConverter):
     non_potential_receiver_node_value: float = 0.1
 
     def __post_init__(self):
+        if not isinstance(self.dataset, KloppyPolarsDataset):
+            raise ValueError("dataset should be of type KloppyPolarsDataset...")
+
         self.pitch_dimensions: MetricPitchDimensions = self.dataset.pitch_dimensions
         self.label_column: str = (
             self.label_col if self.label_col is not None else self.dataset._label_column
@@ -298,14 +301,14 @@ def __compute(self, args: List[pl.Series]) -> dict:
         d = {col: args[i].to_numpy() for i, col in enumerate(self.__exprs_variables)}
 
         if not np.all(d[self.graph_id_column] == d[self.graph_id_column][0]):
-            raise Exception(
-                "GraphId selection contains multiple different values. Make sure each graph_id is unique by at least game_id and frame_id..."
+            raise ValueError(
+                "graph_id selection contains multiple different values. Make sure each graph_id is unique by at least game_id and frame_id..."
             )
 
         if not self.prediction and not np.all(
             d[self.label_column] == d[self.label_column][0]
         ):
-            raise Exception(
+            raise ValueError(
                 """Label selection contains multiple different values for a single selection (group by) of game_id and frame_id, 
                 make sure this is not the case. Each group can only have 1 label."""
             )

From 77ab8c2c0f1ff1e1eed531c5a1d5798708f8d6d9 Mon Sep 17 00:00:00 2001
From: "UnravelSports [JB]" <jors@unravelsports.com>
Date: Mon, 27 Jan 2025 10:02:48 +0100
Subject: [PATCH 10/10] black jupyter

---
 examples/1_kloppy_gnn_train.ipynb | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/examples/1_kloppy_gnn_train.ipynb b/examples/1_kloppy_gnn_train.ipynb
index 39aee88..08fb650 100644
--- a/examples/1_kloppy_gnn_train.ipynb
+++ b/examples/1_kloppy_gnn_train.ipynb
@@ -266,16 +266,15 @@
     "            match_id=match_id,\n",
     "            coordinates=\"secondspectrum\",\n",
     "            include_empty_frames=False,\n",
-    "            limit=500,  \n",
+    "            limit=500,\n",
     "        )\n",
     "        dataset = KloppyPolarsDataset(\n",
-    "            kloppy_dataset=kloppy_dataset,\n",
-    "            ball_carrier_threshold=25.0\n",
+    "            kloppy_dataset=kloppy_dataset, ball_carrier_threshold=25.0\n",
     "        )\n",
     "        dataset.load()\n",
-    "        \n",
+    "\n",
     "        dataset.add_graph_ids()\n",
-    "        \n",
+    "\n",
     "        dataset.add_dummy_labels()\n",
     "\n",
     "        # Initialize the Graph Converter, with dataset, labels and settings\n",