From f73aed7783d290a836b490654cffa59aa8abff36 Mon Sep 17 00:00:00 2001 From: "UnravelSports [JB]" Date: Tue, 26 Nov 2024 18:36:26 +0100 Subject: [PATCH 01/10] intermediate --- unravel/american_football/graphs/dataset.py | 4 +- .../graphs/graph_converter.py | 11 +- unravel/soccer/graphs/__init__.py | 4 + unravel/soccer/graphs/dataset.py | 330 +++++++++++++++ unravel/soccer/graphs/features/__init__.py | 4 + .../graphs/features/adjacency_matrix_pl.py | 42 ++ .../graphs/features/edge_features_pl.py | 185 ++++++++ .../graphs/features/node_features_pl.py | 244 +++++++++++ unravel/soccer/graphs/graph_converter_pl.py | 400 ++++++++++++++++++ unravel/soccer/graphs/graph_settings_pl.py | 38 ++ unravel/utils/features/utils.py | 41 ++ unravel/utils/objects/__init__.py | 1 + unravel/utils/objects/default_dataset.py | 13 + 13 files changed, 1310 insertions(+), 7 deletions(-) create mode 100644 unravel/soccer/graphs/dataset.py create mode 100644 unravel/soccer/graphs/features/adjacency_matrix_pl.py create mode 100644 unravel/soccer/graphs/features/edge_features_pl.py create mode 100644 unravel/soccer/graphs/features/node_features_pl.py create mode 100644 unravel/soccer/graphs/graph_converter_pl.py create mode 100644 unravel/soccer/graphs/graph_settings_pl.py create mode 100644 unravel/utils/objects/default_dataset.py diff --git a/unravel/american_football/graphs/dataset.py b/unravel/american_football/graphs/dataset.py index 93368f5..fdb7310 100644 --- a/unravel/american_football/graphs/dataset.py +++ b/unravel/american_football/graphs/dataset.py @@ -7,11 +7,11 @@ import numpy as np from .graph_settings import AmericanFootballPitchDimensions, Dimension, Unit -from ...utils import add_dummy_label_column, add_graph_id_column +from ...utils import DefaultDataset, add_dummy_label_column, add_graph_id_column @dataclass -class BigDataBowlDataset: +class BigDataBowlDataset(DefaultDataset): tracking_file_path: str players_file_path: str plays_file_path: str diff --git 
a/unravel/american_football/graphs/graph_converter.py b/unravel/american_football/graphs/graph_converter.py index 2f7bfd9..172164d 100644 --- a/unravel/american_football/graphs/graph_converter.py +++ b/unravel/american_football/graphs/graph_converter.py @@ -74,11 +74,6 @@ def _sport_specific_checks(self): if not isinstance(self.chunk_size, int): raise Exception("chunk_size should be of type integer (int)") - if not isinstance(self.attacking_non_qb_node_value, (int, float)): - raise Exception( - "'attacking_non_qb_node_value' should be of type float or integer (int)" - ) - if not self.label_col in self.dataset.columns and not self.prediction: raise Exception( "Please specify a 'label_col' and add that column to your 'dataset' or set 'prediction=True' if you want to use the converted dataset to make predictions on." @@ -89,6 +84,12 @@ def _sport_specific_checks(self): "Please specify a 'graph_id_col' and add that column to your 'dataset' ..." ) + # Parameter Checks + if not isinstance(self.attacking_non_qb_node_value, (int, float)): + raise Exception( + "'attacking_non_qb_node_value' should be of type float or integer (int)" + ) + def _apply_settings(self): return AmericanFootballGraphSettings( pitch_dimensions=self.pitch_dimensions, diff --git a/unravel/soccer/graphs/__init__.py b/unravel/soccer/graphs/__init__.py index 905585c..bd44fac 100644 --- a/unravel/soccer/graphs/__init__.py +++ b/unravel/soccer/graphs/__init__.py @@ -1,5 +1,9 @@ from .graph_converter import SoccerGraphConverter +from .graph_converter_pl import SoccerGraphConverterPL from .graph_settings import SoccerGraphSettings +from .graph_settings_pl import GraphSettingsPL from .graph_frame import GraphFrame from .exceptions import * from .features import * + +from .dataset import KloppyDataset diff --git a/unravel/soccer/graphs/dataset.py b/unravel/soccer/graphs/dataset.py new file mode 100644 index 0000000..459b1d6 --- /dev/null +++ b/unravel/soccer/graphs/dataset.py @@ -0,0 +1,330 @@ +from 
DEFAULT_PLAYER_SMOOTHING_PARAMS = {"window_length": 7, "polyorder": 2}
DEFAULT_BALL_SMOOTHING_PARAMS = {"window_length": 3, "polyorder": 2}


def _rate_of_change(delta: str, alias: str) -> pl.Expr:
    """Return the expression ``delta / dt`` named ``alias``.

    Rows where ``dt`` is null (first sample of a partition) or zero
    (duplicate timestamps) evaluate to 0 instead of null, inf or NaN, so
    downstream smoothing and magnitude computations stay finite.
    """
    return (
        pl.when(pl.col("dt") != 0)
        .then(pl.col(delta) / pl.col("dt"))
        .otherwise(0.0)
        # A null delta with a valid dt (e.g. missing coordinate) also maps to 0.
        .fill_null(0)
        .alias(alias)
    )


@dataclass
class SoccerObject:
    """Identity of one tracked object (a player or the ball)."""

    id: Union[str, int]
    # None for the ball, which belongs to no team.
    team_id: Union[str, int, None]
    position_name: str


@dataclass
class KloppyDataset(DefaultDataset):
    """Long-format Polars view of a Kloppy ``TrackingDataset``.

    ``load`` turns the wide frame returned by ``TrackingDataset.to_df`` into
    one row per object (player or ball) per frame and adds velocity
    (``vx, vy, vz, v``) and acceleration (``ax, ay, az, a``) columns.
    """

    kloppy_dataset: TrackingDataset
    _identifier_column: str = field(default="id", init=False)
    _partition_by: List[str] = field(
        default_factory=lambda: ["id", "period_id"], init=False
    )

    def __transform_orientation(self):
        """Re-orient the dataset and move it to SecondSpectrum coordinates.

        Uses ``BALL_OWNING_TEAM`` orientation when the dataset carries the
        ball-owning-team flag and ``STATIC_HOME_AWAY`` otherwise.
        """
        has_ball_owning_team = bool(
            self.kloppy_dataset.metadata.flags & DatasetFlag.BALL_OWNING_TEAM
        )
        to_orientation = (
            Orientation.BALL_OWNING_TEAM
            if has_ball_owning_team
            else Orientation.STATIC_HOME_AWAY
        )

        pitch_dimensions = self.kloppy_dataset.metadata.pitch_dimensions
        self.kloppy_dataset = DatasetTransformer.transform_dataset(
            dataset=self.kloppy_dataset,
            to_orientation=to_orientation,
            to_coordinate_system=SecondSpectrumCoordinateSystem(
                pitch_length=pitch_dimensions.pitch_length,
                pitch_width=pitch_dimensions.pitch_width,
            ),
        )
        return self.kloppy_dataset

    def __get_objects(self):
        """Return ``(home_players, away_players, ball_object, game_id)`` from metadata."""

        def as_objects(team):
            return [
                SoccerObject(p.player_id, p.team.team_id, p.starting_position.code)
                for p in team.players
            ]

        home_team, away_team = self.kloppy_dataset.metadata.teams
        ball_object = SoccerObject("ball", None, "ball")
        game_id = self.kloppy_dataset.metadata.game_id
        return (as_objects(home_team), as_objects(away_team), ball_object, game_id)

    def __unpivot(self, soccer_object, coordinate):
        """Unpivot the ``<id>_<coordinate>`` column into long format.

        The result keeps the frame-level index columns, stores the values in
        a column named ``coordinate`` and the object id in the identifier
        column.
        """
        column = f"{soccer_object.id}_{coordinate}"

        return self.data.unpivot(
            index=[
                "period_id",
                "timestamp",
                "frame_id",
                "ball_state",
                "ball_owning_team_id",
            ],  # Columns to keep
            on=[column],
            value_name=coordinate,
            variable_name=self._identifier_column,
        ).with_columns(
            # Remove only the trailing "_<coordinate>"; a regex replace would
            # hit the first match, which may sit inside the id itself.
            pl.col(self._identifier_column).str.strip_suffix(f"_{coordinate}")
        )

    def __apply_smoothing(self, df: pl.DataFrame, smoothing_params: dict):
        """Smooth ``vx``, ``vy`` and ``vz`` per partition with a Savitzky-Golay filter.

        Args:
            df: Frame sorted so that each partition (``self._partition_by``)
                is contiguous; the smoothed values are written back by row
                position.
            smoothing_params: Must contain ``window_length`` and ``polyorder``.

        Raises:
            ImportError: If scipy is not installed.
            ValueError: If a required smoothing parameter is missing.
        """
        try:
            from scipy.signal import savgol_filter
        except ImportError as err:
            raise ImportError(
                "Seems like you don't have scipy installed. Please"
                " install it using: pip install scipy"
            ) from err

        window_length = smoothing_params.get("window_length")
        polyorder = smoothing_params.get("polyorder")
        if not window_length:
            raise ValueError(
                "Missing parameter 'window_length' in player_smoothing_params and/or ball_smoothing_params"
            )
        # polyorder=0 is a valid filter order, so only reject a missing value.
        if polyorder is None:
            raise ValueError(
                "Missing parameter 'polyorder' in player_smoothing_params and/or ball_smoothing_params"
            )

        def smooth(values):
            # savgol_filter (default mode "interp") requires at least
            # window_length samples; shorter partitions are left unsmoothed.
            if len(values) < window_length:
                return list(values)
            return savgol_filter(
                values, window_length=window_length, polyorder=polyorder
            ).tolist()

        components = ("vx", "vy", "vz")
        smoothed = df.group_by(self._partition_by, maintain_order=True).agg(
            [
                pl.col(component)
                .map_elements(smooth, return_dtype=pl.List(pl.Float64))
                .alias(f"{component}_smoothed")
                for component in components
            ]
        )
        # Explode the per-partition lists back to one row per sample.
        smoothed_exploded = smoothed.explode(
            [f"{component}_smoothed" for component in components]
        )
        return df.with_columns(
            vx=smoothed_exploded["vx_smoothed"],
            vy=smoothed_exploded["vy_smoothed"],
            vz=smoothed_exploded["vz_smoothed"],
        )

    def __add_velocity(
        self,
        df: pl.DataFrame,
        player_smoothing_params: dict,
        ball_smoothing_params: dict,
    ):
        """Add ``dx, dy, dz, dt``, velocity components and speed ``v``.

        Differences are taken within each partition. Players and ball are
        smoothed separately (each only when its params are truthy) and then
        concatenated, players first.
        """
        df = (
            df.sort(["id", "period_id", "timestamp", "team_id"], nulls_last=True)
            .with_columns(
                [
                    # Calculate differences within each group
                    pl.col("x").diff().over(self._partition_by).alias("dx"),
                    pl.col("y").diff().over(self._partition_by).alias("dy"),
                    pl.col("z").diff().over(self._partition_by).alias("dz"),
                    (pl.col("timestamp").dt.total_milliseconds() / 1_000)
                    .diff()
                    .over(self._partition_by)
                    .alias("dt"),
                ]
            )
            .with_columns(
                [
                    _rate_of_change("dx", "vx"),
                    _rate_of_change("dy", "vy"),
                    _rate_of_change("dz", "vz"),
                ]
            )
        )

        is_ball = pl.col(self._identifier_column) == self._ball_object.id

        player_df = df.filter(~is_ball)
        if player_smoothing_params:
            player_df = self.__apply_smoothing(
                df=player_df, smoothing_params=player_smoothing_params
            )

        ball_df = df.filter(is_ball)
        if ball_smoothing_params:
            ball_df = self.__apply_smoothing(
                ball_df, smoothing_params=ball_smoothing_params
            )

        df = pl.concat([player_df, ball_df])
        return df.with_columns(
            [
                (pl.col("vx") ** 2 + pl.col("vy") ** 2 + pl.col("vz") ** 2)
                .sqrt()
                .alias("v")
            ]
        )

    def __add_acceleration(self, df: pl.DataFrame):
        """Add ``dvx, dvy, dvz``, acceleration components and magnitude ``a``.

        Expects the ``vx, vy, vz`` and ``dt`` columns produced by the
        velocity step.
        """
        return (
            df.with_columns(
                [
                    # Velocity differences within each partition
                    pl.col("vx").diff().over(self._partition_by).alias("dvx"),
                    pl.col("vy").diff().over(self._partition_by).alias("dvy"),
                    pl.col("vz").diff().over(self._partition_by).alias("dvz"),
                ]
            )
            .with_columns(
                [
                    _rate_of_change("dvx", "ax"),
                    _rate_of_change("dvy", "ay"),
                    _rate_of_change("dvz", "az"),
                ]
            )
            .with_columns(
                [
                    # Compute magnitude of acceleration a
                    (pl.col("ax") ** 2 + pl.col("ay") ** 2 + pl.col("az") ** 2)
                    .sqrt()
                    .alias("a")
                ]
            )
        )

    def __melt(
        self,
        home_players: List[SoccerObject],
        away_players: List[SoccerObject],
        ball_object: SoccerObject,
        game_id: Union[int, str],
    ):
        """Convert the wide tracking frame into one row per object per frame.

        Objects without both an ``<id>_x`` and ``<id>_y`` column are skipped.
        ``z`` is read from the data for the ball only (when the column
        exists) and set to 0.0 otherwise.

        Raises:
            ValueError: If no object has coordinate columns in the data.
        """
        available_columns = set(self.data.columns)
        melted_dfs = []

        for soccer_object in [ball_object, *home_players, *away_players]:
            wanted = ["x", "y"]
            if soccer_object.id == ball_object.id:
                wanted.append("z")
            # Exact column lookup: a substring test would let id "1" match
            # the columns of id "11".
            coordinates = [
                coordinate
                for coordinate in wanted
                if f"{soccer_object.id}_{coordinate}" in available_columns
            ]
            if "x" not in coordinates or "y" not in coordinates:
                continue

            # The first frame keeps the index columns; later ones contribute
            # only their value column.
            melted_object_dfs = [
                self.__unpivot(soccer_object, coordinate)
                if k == 0
                else self.__unpivot(soccer_object, coordinate).select(coordinate)
                for k, coordinate in enumerate(coordinates)
            ]
            object_df = pl.concat(melted_object_dfs, how="horizontal")
            if "z" not in object_df.columns:
                object_df = object_df.with_columns([pl.lit(0.0).alias("z")])
            object_df = object_df.with_columns(
                [
                    pl.lit(soccer_object.team_id).cast(pl.Utf8).alias("team_id"),
                    pl.lit(soccer_object.position_name).alias("position_name"),
                ]
            )
            melted_dfs.append(object_df)

        if not melted_dfs:
            raise ValueError(
                "None of the objects listed in the dataset metadata have"
                " coordinate columns in the tracking data"
            )

        df = pl.concat(melted_dfs, how="vertical")
        df = df.with_columns([pl.lit(game_id).alias("game_id")])
        return df.sort(by=["period_id", "timestamp", "team_id"], nulls_last=True)

    def load(
        self,
        player_smoothing_params: Union[dict, None] = DEFAULT_PLAYER_SMOOTHING_PARAMS,
        ball_smoothing_params: Union[dict, None] = DEFAULT_BALL_SMOOTHING_PARAMS,
    ):
        """Build the long-format frame with velocity and acceleration columns.

        Args:
            player_smoothing_params: Savitzky-Golay parameters for player
                velocities; a falsy value disables smoothing.
            ball_smoothing_params: Same, for the ball.

        Returns:
            The tuple ``(data, pitch_dimensions)``; both are also stored on
            the instance.
        """
        self.kloppy_dataset = self.__transform_orientation()
        self.pitch_dimensions = self.kloppy_dataset.metadata.pitch_dimensions

        self.data = self.kloppy_dataset.to_df(engine="polars")
        (self._home_players, self._away_players, self._ball_object, self._game_id) = (
            self.__get_objects()
        )
        df = self.__melt(
            self._home_players, self._away_players, self._ball_object, self._game_id
        )
        df = self.__add_velocity(df, player_smoothing_params, ball_smoothing_params)
        df = self.__add_acceleration(df)
        # Intermediate difference columns are not part of the public frame.
        self.data = df.drop(["dx", "dy", "dz", "dt", "dvx", "dvy", "dvz"])

        return self.data, self.pitch_dimensions

    def add_dummy_labels(
        self,
        by: Union[List[str], None] = None,
        column_name: str = "label",
    ) -> pl.DataFrame:
        """Add a dummy label column via ``add_dummy_label_column``.

        ``by`` defaults to ``["game_id", "frame_id"]``.
        """
        if by is None:
            by = ["game_id", "frame_id"]
        self.data = add_dummy_label_column(self.data, by, column_name)
        return self.data

    def add_graph_ids(
        self,
        by: Union[List[str], None] = None,
        column_name: str = "graph_id",
    ) -> pl.DataFrame:
        """Add a graph id column via ``add_graph_id_column``.

        ``by`` defaults to ``["game_id", "period_id"]``.
        """
        if by is None:
            by = ["game_id", "period_id"]
        self.data = add_graph_id_column(self.data, by, column_name)
        return self.data
def compute_adjacency_matrix_pl(team, possession_team, settings, ball_carrier_idx):
    """Build the node-by-node adjacency matrix for one frame.

    Args:
        team: 1-D array with one team id per node; ball nodes carry
            ``settings.ball_id``.
        possession_team: Array whose first unique value is the id of the
            team in possession.
        settings: Object exposing ``adjacency_matrix_type``,
            ``adjacency_matrix_connect_type`` and ``ball_id``.
        ball_carrier_idx: Node index of the ball carrier, or None.

    Returns:
        Integer array of shape ``(len(team), len(team))`` with 1 where two
        nodes are connected.

    Raises:
        NotImplementedError: For ``DELAUNAY`` or an unknown adjacency type.
    """
    adjacency_matrix_type = settings.adjacency_matrix_type
    adjacency_matrix_connect_type = settings.adjacency_matrix_connect_type
    ball_id = settings.ball_id
    n_nodes = team.shape[0]

    if adjacency_matrix_type == AdjacencyMatrixType.DENSE:
        adjacency_matrix = np.ones((n_nodes, n_nodes)).astype(np.int32)
    elif adjacency_matrix_type == AdjacencyMatrixType.DENSE_AP:
        is_att = team == np.unique(possession_team)[0]
        adjacency_matrix = np.outer(is_att, is_att).astype(int)
    elif adjacency_matrix_type == AdjacencyMatrixType.DENSE_DP:
        # Resolve the defending team only here: frames without any defending
        # node must not break the other adjacency types.
        exclusion_ids = np.asarray([ball_id, *np.unique(possession_team)])
        defending_ids = np.setdiff1d(team, exclusion_ids)
        if defending_ids.size:
            is_def = team == defending_ids[0]
        else:
            # No defenders in this frame, hence no defender-defender edges.
            is_def = np.zeros(n_nodes, dtype=bool)
        adjacency_matrix = np.outer(is_def, is_def).astype(int)
    elif adjacency_matrix_type == AdjacencyMatrixType.SPLIT_BY_TEAM:
        # Create a pairwise team comparison matrix
        adjacency_matrix = np.equal(team[:, None], team[None, :]).astype(np.int32)
    elif adjacency_matrix_type == AdjacencyMatrixType.DELAUNAY:
        raise NotImplementedError("Delaunay matrix not implemented for Soccer...")
    else:
        raise NotImplementedError("Please specify an existing AdjacencyMatrixType...")

    if adjacency_matrix_connect_type:
        # True where either endpoint of the edge is a ball node
        ball_mask = (team[:, None] == ball_id) | (team[None, :] == ball_id)
        if adjacency_matrix_connect_type == AdjacenyMatrixConnectType.BALL:
            # Connect the ball to every node
            adjacency_matrix = np.where(ball_mask, 1, adjacency_matrix)
        elif adjacency_matrix_connect_type == AdjacenyMatrixConnectType.BALL_CARRIER:
            if ball_carrier_idx is not None:
                # Connect the ball only to the ball carrier, in both directions
                adjacency_matrix[ball_carrier_idx, ball_mask[ball_carrier_idx, :]] = 1
                adjacency_matrix[ball_mask[:, ball_carrier_idx], ball_carrier_idx] = 1

    return adjacency_matrix
def compute_edge_features_pl(adjacency_matrix, p3d, p2d, s, velocity, team, settings):
    """Compute one feature row per connected edge of ``adjacency_matrix``.

    Each feature is first computed for every ordered node pair as an N x N
    matrix and then reduced to the connected edges with ``reindex``. The six
    features are, in order: normalised distance, normalised speed
    difference, cos/sin of the angle of the position difference vector, and
    cos/sin of the angle returned by ``angle_between`` for the position and
    velocity difference vectors.

    Args:
        adjacency_matrix: N x N matrix; its non-zero entries select the edges.
        p3d: (N, 3) positions, used for distances.
        p2d: (N, 2) positions, used for angles.
        s: (N,) speeds.
        velocity: (N, 2) velocity vectors.
        team: Per-node team ids, forwarded to the speed normaliser.
        settings: Provides ``pitch_dimensions`` and is forwarded to the
            speed normaliser.

    Returns:
        Array with one row per connected edge and six columns, NaN replaced
        by 0.
    """
    max_dist_to_player = np.sqrt(
        settings.pitch_dimensions.pitch_length**2
        + settings.pitch_dimensions.pitch_width**2
    )

    # Pairwise distances using broadcasting; NaN marks pairs that involve a
    # node without a valid position.
    distances_between_players = np.linalg.norm(
        p3d[:, None, :] - p3d[None, :, :], axis=-1
    )  # NxN
    nan_mask = np.isnan(distances_between_players)
    dist_matrix_normed = normalize_distance(
        distances_between_players, max_distance=max_dist_to_player
    )  # NxN

    speed_diff_matrix = np.nan_to_num(s[None, :] - s[:, None])  # NxN
    speed_diff_matrix_normed = normalize_speed_differences_nfl(
        s=speed_diff_matrix,
        team=team,
        settings=settings,
    )

    vect_to_player_matrix = p2d[:, None, :] - p2d[None, :, :]  # NxNx2
    velocity_diff_matrix = velocity[None, :, :] - velocity[:, None, :]  # NxNx2

    # Angle of the vector between two nodes, as sin and cos
    angle_pos_matrix = np.nan_to_num(
        np.arctan2(vect_to_player_matrix[:, :, 1], vect_to_player_matrix[:, :, 0])
    )
    pos_cos_matrix = normalize_sincos(np.nan_to_num(np.cos(angle_pos_matrix)))
    pos_sin_matrix = normalize_sincos(np.nan_to_num(np.sin(angle_pos_matrix)))

    # angle_between receives [position difference, velocity difference]
    combined_matrix = np.concatenate(
        (vect_to_player_matrix, velocity_diff_matrix), axis=2
    )  # NxNx4
    angle_vel_matrix = np.apply_along_axis(angle_between, 2, combined_matrix)
    vel_cos_matrix = normalize_sincos(np.nan_to_num(np.cos(angle_vel_matrix)))
    vel_sin_matrix = normalize_sincos(np.nan_to_num(np.sin(angle_vel_matrix)))

    feature_matrices = [
        dist_matrix_normed,
        speed_diff_matrix_normed,
        pos_cos_matrix,
        pos_sin_matrix,
        vel_cos_matrix,
        vel_sin_matrix,
    ]
    # Zero every feature for pairs without a valid distance. The velocity
    # angle features are included so all six columns agree for such pairs.
    for feature_matrix in feature_matrices:
        feature_matrix[nan_mask] = 0

    non_zero_idxs, len_a = non_zeros(A=adjacency_matrix)
    e = np.concatenate(
        [
            reindex(feature_matrix, non_zero_idxs, len_a)
            for feature_matrix in feature_matrices
        ],
        axis=1,
    )
    return np.nan_to_num(e)
np.asarray([p.speed for p in players2]) + +# distances_between_players = np.linalg.norm( +# h_pos[:, None, :] - a_pos[None, :, :], axis=-1 +# ) +# nan_mask = np.isnan(distances_between_players) + +# dist_matrix = normalize_distance( +# distances_between_players, max_distance=max_dist_to_player +# ) # 11x11 + +# speed_diff_matrix = np.nan_to_num( +# normalize_speed(a_spe[None, :], max_speed=max(max_player_speed, max_ball_speed)) +# - normalize_speed( +# h_spe[:, None], max_speed=max(max_player_speed, max_ball_speed) +# ) +# ) # 11x11x1 + +# vect_to_player_matrix = ( +# h_pos[:, None, :] - a_pos[None, :, :] +# ) # 11x11x2 the vector between two players +# v_normed_matrix = a_vel[None, :, :] - h_vel[:, None, :] # 11x11x2 + +# angle_pos_matrix = np.nan_to_num( +# np.arctan2(vect_to_player_matrix[:, :, 1], vect_to_player_matrix[:, :, 0]) +# ) +# pos_cos_matrix = normalize_sincos(np.nan_to_num(np.cos(angle_pos_matrix))) +# pos_sin_matrix = normalize_sincos(np.nan_to_num(np.sin(angle_pos_matrix))) + +# combined_matrix = np.concatenate((vect_to_player_matrix, v_normed_matrix), axis=2) +# angle_vel_matrix = np.apply_along_axis(angle_between, 2, combined_matrix) +# vel_cos_matrix = normalize_sincos(np.nan_to_num(np.cos(angle_vel_matrix))) +# vel_sin_matrix = normalize_sincos(np.nan_to_num(np.sin(angle_vel_matrix))) + +# non_zero_idxs, len_a = non_zeros(A=adjacency_matrix) +# # create a matrix where 1 if edge is same team else 0 + +# # if we have nan values we mask them to 0. 
def compute_node_features_pl(
    x,
    y,
    s,
    velocity,
    team,
    possession_team,
    is_gk,
    settings,
):
    """Assemble the per-node feature matrix for one frame.

    The twelve columns are, in order: normalised x, normalised y, unit
    velocity x, unit velocity y, normalised speed, sin and cos of the
    (normalised) velocity angle, normalised distance to the goal mouth,
    normalised distance to the ball, possession flag, goalkeeper flag and
    ball flag. NaN values are replaced by 0.
    """
    dims = settings.pitch_dimensions
    ball_id = settings.ball_id

    # Centre of the goal on the x_dim.max side of the pitch.
    goal_mouth_position = (
        dims.x_dim.max,
        (dims.y_dim.max + dims.y_dim.min) / 2,
    )
    # Pitch diagonal: the largest distance used to normalise both the
    # distance to the ball and the distance to the goal.
    pitch_diagonal = np.sqrt(dims.pitch_length**2 + dims.pitch_width**2)

    position, _ball_position, dist_to_ball = distance_to_ball(
        x=x, y=y, team=team, ball_id=ball_id
    )

    x_normed = normalize_between(
        value=x, max_value=dims.x_dim.max, min_value=dims.x_dim.min
    )
    y_normed = normalize_between(
        value=y, max_value=dims.y_dim.max, min_value=dims.y_dim.min
    )
    s_normed = normalize_speeds_nfl(s, team, settings)

    uv_velocity = unit_vectors(velocity)
    direction_x = uv_velocity[:, 0]
    direction_y = uv_velocity[:, 1]

    # NOTE(review): sin/cos are taken of the output of normalize_angles, not
    # of the raw arctan2 angle - confirm this is intended.
    angles = normalize_angles(np.arctan2(direction_y, direction_x))
    sin_normed = normalize_sincos(np.sin(angles))
    cos_normed = normalize_sincos(np.cos(angles))

    dist_to_goal = np.linalg.norm(position - goal_mouth_position, axis=1)
    normed_dist_to_goal = normalize_distance(
        value=dist_to_goal, max_distance=pitch_diagonal
    )
    normed_dist_to_ball = normalize_distance(
        value=dist_to_ball, max_distance=pitch_diagonal
    )

    # 1 for nodes of the team in possession, the configured value otherwise.
    is_possession_team = np.where(
        team == possession_team, 1, settings.defending_team_node_value
    )
    is_ball = np.where(team == ball_id, 1, 0)

    feature_columns = (
        x_normed,
        y_normed,
        direction_x,
        direction_y,
        s_normed,
        sin_normed,
        cos_normed,
        normed_dist_to_goal,
        normed_dist_to_ball,
        is_possession_team,
        is_gk,
        is_ball,
    )
    return np.nan_to_num(np.stack(feature_columns, axis=-1))
ball_features n_node_features +# """ + +# goal_mouth_position = ( +# pitch_dimensions.pitch_length, +# pitch_dimensions.pitch_width / 2, +# ) +# max_dist_to_player = np.sqrt( +# pitch_dimensions.pitch_length**2 + pitch_dimensions.pitch_width**2 +# ) +# max_dist_to_goal = np.sqrt( +# pitch_dimensions.pitch_length**2 + pitch_dimensions.pitch_width**2 +# ) + +# def player_features(p, team, potential_receiver=None): +# ball_angle = math.atan2(p.y1 - ball.y1, p.x1 - ball.x1) +# goal_angle = math.atan2( +# p.y1 - goal_mouth_position[0], p.x1 - goal_mouth_position[1] +# ) + +# player_node_features = [ +# ( +# 0.0 +# if np.isnan(p.x1) +# else normalize_coords(p.x1, pitch_dimensions.x_dim.max) +# ), +# ( +# 0.0 +# if np.isnan(p.x1) +# else normalize_coords(p.y1, pitch_dimensions.y_dim.max) +# ), +# 0.0 if np.isnan(p.x1) else unit_vector(p.velocity)[0], +# 0.0 if np.isnan(p.x1) else unit_vector(p.velocity)[1], +# ( +# 0.0 +# if np.isnan(p.x1) +# else round(normalize_speed(p.speed, max_speed=max_player_speed), 3) +# ), +# ( +# 0.0 +# if np.isnan(p.x1) +# else normalize_angles(np.arctan2(p.velocity[1], p.velocity[0])) +# ), +# ( +# 0.0 +# if np.isnan(p.x1) +# else normalize_distance( +# np.linalg.norm(p.position - goal_mouth_position), +# max_distance=max_dist_to_goal, +# ) +# ), # distance to the goal mouth +# 0.0 if np.isnan(p.x1) else normalize_angles(goal_angle), +# ( +# 0.0 +# if np.isnan(p.x1) +# else normalize_distance( +# np.linalg.norm(p.position - ball.position), +# max_distance=max_dist_to_player, +# ) +# ), # distance to the ball +# 0.0 if np.isnan(p.x1) else normalize_angles(ball_angle), +# 0.0 if np.isnan(p.x1) else team, +# # 1 if player is on same team but not in possession, 0.1 for all other players, 0.1 if the player is 'missing' +# ( +# 0.0 +# if np.isnan(p.x1) +# else 1.0 if potential_receiver else non_potential_receiver_node_value +# ), +# ] +# return player_node_features + +# def ball_features(ball): +# goal_angle = math.atan2( +# ball.y1 - 
goal_mouth_position[1], ball.x1 - goal_mouth_position[0] +# ) +# ball_node_features = [ +# normalize_coords(ball.x1, pitch_dimensions.x_dim.max), +# normalize_coords(ball.y1, pitch_dimensions.y_dim.max), +# unit_vector(ball.velocity)[0], +# unit_vector(ball.velocity)[1], +# round(normalize_speed(ball.speed, max_speed=max_ball_speed), 3), +# normalize_angles(np.arctan2(ball.velocity[1], ball.velocity[0])), +# normalize_distance( +# np.linalg.norm(ball.position - goal_mouth_position), +# max_distance=max_dist_to_goal, +# ), # distance to the goal mouth +# normalize_angles(goal_angle), +# # ball_angle 2x, ball_dist 2x, attacking_team 2x, ball carrier, potential receiver (all always 0 for ball) +# 0, +# 0, +# 0, +# 0, # , 0 +# ] + +# return np.asarray([ball_node_features]) + +# # loop over attacking players, grab ball_carrier, potential receiver and intended receiver +# ap_features = np.asarray( +# [ +# player_features(p, team=1, potential_receiver=(i != ball_carrier_idx)) +# for i, p in enumerate(attacking_players) +# ] +# ) + +# # loop over defending playres, we don't have ball_carrier, or receivers +# dp_features = np.asarray( +# [ +# player_features(p, team=defending_team_node_value) +# for i, p in enumerate(defending_players) +# ] +# ) + +# # compute ball features +# b_features = ball_features(ball) +# X = np.append(ap_features, dp_features, axis=0) + +# if include_ball_node: +# X = np.append(X, b_features, axis=0) + +# # convert np.NaN to 0 (zero) +# X = np.nan_to_num(X) +# return X diff --git a/unravel/soccer/graphs/graph_converter_pl.py b/unravel/soccer/graphs/graph_converter_pl.py new file mode 100644 index 0000000..ad694f4 --- /dev/null +++ b/unravel/soccer/graphs/graph_converter_pl.py @@ -0,0 +1,400 @@ +import logging +import sys +from copy import deepcopy + +import pandas as pd + +import warnings + +from dataclasses import dataclass, field, asdict + +from typing import List, Union, Dict, Literal + +from kloppy.domain import ( + TrackingDataset, + Frame, + 
Orientation, + DatasetTransformer, + DatasetFlag, + SecondSpectrumCoordinateSystem, + MetricPitchDimensions, +) + +from spektral.data import Graph + +from .exceptions import ( + MissingLabelsError, + MissingDatasetError, + IncorrectDatasetTypeError, + KeyMismatchError, +) + +from .graph_settings_pl import GraphSettingsPL +from .dataset import KloppyDataset +from .features import ( + compute_node_features_pl, + compute_adjacency_matrix_pl, + compute_edge_features_pl, +) + +from ...utils import * + +logger = logging.getLogger(__name__) +logger.setLevel(logging.DEBUG) +stdout_handler = logging.StreamHandler(sys.stdout) +logger.addHandler(stdout_handler) + + +@dataclass(repr=True) +class SoccerGraphConverterPL(DefaultGraphConverter): + """ + Converts our dataset TrackingDataset into an internal structure + + Attributes: + dataset (TrackingDataset): Kloppy TrackingDataset. + labels (dict): Dict with a key per frame_id, like so {frame_id: True/False/1/0} + graph_id (str, int): Set a single id for the whole Kloppy dataset. + graph_ids (dict): Frame level control over graph ids. + + The graph_ids will be used to assign each graph an identifier. This identifier allows us to split the CustomSpektralDataset such that + all graphs with the same id are either all in the test, train or validation set to avoid leakage. It is recommended to either set graph_id (int, str) as + a match_id, or pass a dictionary into 'graph_ids' with exactly the same keys as 'labels' for more granualar control over the graph ids. + The latter can be useful when splitting graphs by possession or sequence id. In this case the dict would be {frame_id: sequence_id/possession_id}. + Note that sequence_id/possession_id should probably be unique for the whole dataset. Perhaps like so {frame_id: 'match_id-sequence_id'}. Defaults to None. 
+ + infer_ball_ownership (bool): + Infers 'attacking_team' if no 'ball_owning_team' (Kloppy) or 'attacking_team' (List[Dict]) is provided, by finding player closest to ball using ball xyz. + Also infers ball_carrier within ball_carrier_threshold + infer_goalkeepers (bool): set True if no GK label is provider, set False for incomplete (broadcast tracking) data that might not have a GK in every frame + ball_carrier_threshold (float): The distance threshold to determine the ball carrier. Defaults to 25.0. + boundary_correction (float): A correction factor for boundary calculations, used to correct out of bounds as a percentages (Used as 1+boundary_correction, ie 0.05). Defaults to None. + non_potential_receiver_node_value (float): Value between 0 and 1 to assign to the defing team players + """ + + dataset: KloppyDataset = None + + label_col: str = "label" + graph_id_col: str = "graph_id" + + chunk_size: int = 2_0000 + + infer_goalkeepers: bool = True + infer_ball_ownership: bool = True + boundary_correction: float = None + ball_carrier_treshold: float = 25.0 + + non_potential_receiver_node_value: float = 0.1 + + def __post_init__(self): + self.pitch_dimensions: MetricPitchDimensions = self.dataset.pitch_dimensions + self.dataset = self.dataset.data + + self._sport_specific_checks() + self.settings = self._apply_settings() + self.dataset = self._apply_filters() + + def _apply_filters(self): + return self.dataset.with_columns( + pl.when( + (pl.col(self.settings._identifier_column) == self.settings.ball_id) + & (pl.col("v") > self.settings.max_ball_speed) + ) + .then(self.settings.max_ball_speed) + .when( + (pl.col(self.settings._identifier_column) != self.settings.ball_id) + & (pl.col("v") > self.settings.max_player_speed) + ) + .then(self.settings.max_player_speed) + .otherwise(pl.col("v")) + .alias("v") + ).with_columns( + pl.when( + (pl.col(self.settings._identifier_column) == self.settings.ball_id) + & (pl.col("a") > self.settings.max_ball_acceleration) + ) + 
.then(self.settings.max_ball_acceleration) + .when( + (pl.col(self.settings._identifier_column) != self.settings.ball_id) + & (pl.col("a") > self.settings.max_player_acceleration) + ) + .then(self.settings.max_player_acceleration) + .otherwise(pl.col("a")) + .alias("a") + ) + + def _apply_settings(self): + return GraphSettingsPL( + pitch_dimensions=self.pitch_dimensions, + ball_carrier_treshold=self.ball_carrier_treshold, + max_player_speed=self.max_player_speed, + max_ball_speed=self.max_ball_speed, + max_player_acceleration=self.max_player_acceleration, + max_ball_acceleration=self.max_ball_acceleration, + boundary_correction=self.boundary_correction, + self_loop_ball=self.self_loop_ball, + adjacency_matrix_connect_type=self.adjacency_matrix_connect_type, + adjacency_matrix_type=self.adjacency_matrix_type, + label_type=self.label_type, + infer_ball_ownership=self.infer_ball_ownership, + infer_goalkeepers=self.infer_goalkeepers, + defending_team_node_value=self.defending_team_node_value, + non_potential_receiver_node_value=self.non_potential_receiver_node_value, + random_seed=self.random_seed, + pad=self.pad, + verbose=self.verbose, + ) + + def _sport_specific_checks(self): + if not isinstance(self.label_col, str): + raise Exception("'label_col' should be of type string (str)") + + if not isinstance(self.graph_id_col, str): + raise Exception("'graph_id_col' should be of type string (str)") + + if not isinstance(self.chunk_size, int): + raise Exception("chunk_size should be of type integer (int)") + + if not self.label_col in self.dataset.columns and not self.prediction: + raise Exception( + "Please specify a 'label_col' and add that column to your 'dataset' or set 'prediction=True' if you want to use the converted dataset to make predictions on." + ) + + if not self.graph_id_col in self.dataset.columns: + raise Exception( + "Please specify a 'graph_id_col' and add that column to your 'dataset' ..." 
+ ) + + # Parameter Checks + if not isinstance(self.infer_goalkeepers, bool): + raise Exception("'infer_goalkeepers' should be of type boolean (bool)") + + if not isinstance(self.infer_ball_ownership, bool): + raise Exception("'infer_ball_ownership' should be of type boolean (bool)") + + if self.boundary_correction and not isinstance(self.boundary_correction, float): + raise Exception("'boundary_correction' should be of type float") + + if self.ball_carrier_treshold and not isinstance( + self.ball_carrier_treshold, float + ): + raise Exception("'ball_carrier_treshold' should be of type float") + + if self.non_potential_receiver_node_value and not isinstance( + self.non_potential_receiver_node_value, float + ): + raise Exception( + "'non_potential_receiver_node_value' should be of type float" + ) + + def _convert(self): + def __compute(args: List[pl.Series]) -> dict: + x = args[0].to_numpy() + y = args[1].to_numpy() + z = args[2].to_numpy() + v = args[3].to_numpy() + vx = args[4].to_numpy() + vy = args[5].to_numpy() + vz = args[6].to_numpy() + a = args[7].to_numpy() + ax = args[8].to_numpy() + ay = args[9].to_numpy() + az = args[10].to_numpy() + + team_id = args[6].to_numpy() + position_name = args[7].to_numpy() + ball_owning_team_id = args[8].to_numpy() + graph_id = args[9].to_numpy() + label = args[10].to_numpy() + + if not np.all(graph_id == graph_id[0]): + raise Exception( + "GraphId selection contains multiple different values. Make sure each GraphId is unique by at least playId and frameId..." + ) + + if not self.prediction and not np.all(label == label[0]): + raise Exception( + "Label selection contains multiple different values for a single selection (group by) of playId and frameId, make sure this is not the case. Each group can only have 1 label." 
+ ) + + ball_carrier_idx = get_ball_carrier_idx( + x=x, + y=y, + z=z, + team=team_id, + possession_team=ball_owning_team_id, + ball_id=self.settings.ball_id, + threshold=self.settings.ball_carrier_treshold, + ) + + adjacency_matrix = compute_adjacency_matrix_pl( + team=team_id, + possession_team=ball_owning_team_id, + settings=self.settings, + ball_carrier_idx=ball_carrier_idx, + ) + edge_features = compute_edge_features_pl( + adjacency_matrix=adjacency_matrix, + p3d=np.stack((x, y, z), axis=-1), + p2d=np.stack((x, y), axis=-1), + s=v, + velocity=np.stack((vx, vy), axis=-1), + team=team_id, + settings=self.settings, + ) + node_features = compute_node_features_pl( + x, + y, + s=v, + velocity=np.stack((vx, vy), axis=-1), + team=team_id, + possession_team=ball_owning_team_id, + is_gk=(position_name == self.settings.goalkeeper_id).astype(int), + settings=self.settings, + ) + return { + "e": pl.Series( + [edge_features.tolist()], dtype=pl.List(pl.List(pl.Float64)) + ), + "x": pl.Series( + [node_features.tolist()], dtype=pl.List(pl.List(pl.Float64)) + ), + "a": pl.Series( + [adjacency_matrix.tolist()], dtype=pl.List(pl.List(pl.Int32)) + ), + "e_shape_0": edge_features.shape[0], + "e_shape_1": edge_features.shape[1], + "x_shape_0": node_features.shape[0], + "x_shape_1": node_features.shape[1], + "a_shape_0": adjacency_matrix.shape[0], + "a_shape_1": adjacency_matrix.shape[1], + self.graph_id_col: graph_id[0], + self.label_col: label[0], + } + + result_df = self.dataset.group_by( + ["game_id", "frame_id"], maintain_order=True + ).agg( + pl.map_groups( + exprs=[ + "x", + "y", + "z", + "v", + "vx", + "vy", + "vz", + "a", + "ax", + "ay", + "az", + "team_id", + "position_name", + "ball_owning_team_id", + self.graph_id_col, + self.label_col, + ], + function=__compute, + ).alias("result_dict") + ) + + graph_df = result_df.with_columns( + [ + pl.col("result_dict").struct.field("a").alias("a"), + pl.col("result_dict").struct.field("e").alias("e"), + 
pl.col("result_dict").struct.field("x").alias("x"), + pl.col("result_dict").struct.field("e_shape_0").alias("e_shape_0"), + pl.col("result_dict").struct.field("e_shape_1").alias("e_shape_1"), + pl.col("result_dict").struct.field("x_shape_0").alias("x_shape_0"), + pl.col("result_dict").struct.field("x_shape_1").alias("x_shape_1"), + pl.col("result_dict").struct.field("a_shape_0").alias("a_shape_0"), + pl.col("result_dict").struct.field("a_shape_1").alias("a_shape_1"), + pl.col("result_dict") + .struct.field(self.graph_id_col) + .alias(self.graph_id_col), + pl.col("result_dict") + .struct.field(self.label_col) + .alias(self.label_col), + ] + ) + + return graph_df.drop("result_dict") + + def to_graph_frames(self) -> List[dict]: + def __convert_to_graph_data_list(df): + lazy_df = df.lazy() + + graph_list = [] + + for chunk in lazy_df.collect().iter_slices(self.chunk_size): + chunk_graph_list = [ + { + "a": make_sparse( + flatten_to_reshaped_array( + arr=chunk["a"][i], + s0=chunk["a_shape_0"][i], + s1=chunk["a_shape_1"][i], + ) + ), + "x": flatten_to_reshaped_array( + arr=chunk["x"][i], + s0=chunk["x_shape_0"][i], + s1=chunk["x_shape_1"][i], + ), + "e": flatten_to_reshaped_array( + arr=chunk["e"][i], + s0=chunk["e_shape_0"][i], + s1=chunk["e_shape_1"][i], + ), + "y": np.asarray([chunk[self.label_col][i]]), + "id": chunk[self.graph_id_col][i], + } + for i in range(len(chunk["a"])) + ] + graph_list.extend(chunk_graph_list) + + return graph_list + + graph_df = self._convert() + self.graph_frames = __convert_to_graph_data_list(graph_df) + + return self.graph_frames + + def to_spektral_graphs(self) -> List[Graph]: + if not self.graph_frames: + self.to_graph_frames() + + return [ + Graph( + x=d["x"], + a=d["a"], + e=d["e"], + y=d["y"], + id=d["id"], + ) + for d in self.graph_frames + ] + + def to_pickle(self, file_path: str) -> None: + """ + We store the 'dict' version of the Graphs to pickle each graph is now a dict with keys x, a, e, and y + To use for training with 
Spektral feed the loaded pickle data to CustomDataset(data=pickled_data) + """ + if not file_path.endswith("pickle.gz"): + raise ValueError( + "Only compressed pickle files of type 'some_file_name.pickle.gz' are supported..." + ) + + if not self.graph_frames: + self.to_graph_frames() + + import pickle + import gzip + from pathlib import Path + + path = Path(file_path) + + directories = path.parent + directories.mkdir(parents=True, exist_ok=True) + + with gzip.open(file_path, "wb") as file: + pickle.dump(self.graph_frames, file) diff --git a/unravel/soccer/graphs/graph_settings_pl.py b/unravel/soccer/graphs/graph_settings_pl.py new file mode 100644 index 0000000..0ef8dce --- /dev/null +++ b/unravel/soccer/graphs/graph_settings_pl.py @@ -0,0 +1,38 @@ +from dataclasses import dataclass + +from ...utils import DefaultGraphSettings + +from dataclasses import dataclass, field +from kloppy.domain import Dimension, Unit, MetricPitchDimensions +from typing import Optional + + +@dataclass +class GraphSettingsPL(DefaultGraphSettings): + ball_id: str = "ball" + goalkeeper_id: str = "GK" + infer_goalkeepers: bool = True + boundary_correction: float = None + non_potential_receiver_node_value: float = 0.1 + ball_carrier_treshold: float = 25.0 + pitch_dimensions: MetricPitchDimensions = field( + init=False, repr=False, default_factory=MetricPitchDimensions + ) + _identifier_column: str = field(default="id", init=False) + + def __post_init__(self): + self._sport_specific_checks() + + @property + def pitch_dimensions(self) -> int: + return self._pitch_dimensions + + @pitch_dimensions.setter + def pitch_dimensions(self, pitch_dimensions: MetricPitchDimensions) -> None: + self._pitch_dimensions = pitch_dimensions + + def _sport_specific_checks(self): + if self.non_potential_receiver_node_value > 1: + self.non_potential_receiver_node_value = 1 + elif self.non_potential_receiver_node_value < 0: + self.non_potential_receiver_node_value = 0 diff --git a/unravel/utils/features/utils.py 
b/unravel/utils/features/utils.py index d74c931..282fe61 100644 --- a/unravel/utils/features/utils.py +++ b/unravel/utils/features/utils.py @@ -69,6 +69,15 @@ def unit_vector(vector): return vector / norm +def unit_vectors(vectors): + magnitudes = np.linalg.norm(vectors, axis=1, keepdims=True) + + magnitudes[magnitudes == 0] = 1 + + unit_vectors = vectors / magnitudes + return unit_vectors + + def normalize_coords(value, max_value): return value / max_value @@ -172,3 +181,35 @@ def flatten_to_reshaped_array(arr, s0, s1, as_list=False): # Concatenate the arrays into one single array result_array = np.concatenate(flattened_list).reshape(s0, s1) return result_array if not as_list else result_array.tolist() + + +def distance_to_ball( + x: np.array, y: np.array, team: np.array, ball_id: str, z: np.array = None +): + if z is not None: + position = np.stack((x, y, z), axis=-1) + else: + position = np.stack((x, y), axis=-1) + if np.where(team == ball_id)[0].size >= 1: + ball_index = np.where(team == ball_id)[0] + ball_position = position[ball_index][0] + else: + if z is not None: + ball_position = np.asarray([0.0, 0.0, 0.0]) + else: + ball_position = np.asarray([0.0, 0.0]) + dist_to_ball = np.linalg.norm(position - ball_position, axis=1) + return position, ball_position, dist_to_ball + + +def get_ball_carrier_idx(x, y, z, team, possession_team, ball_id, threshold): + _, _, dist_to_ball = distance_to_ball(x=x, y=y, z=z, team=team, ball_id=ball_id) + + filtered_distances = np.where( + (team != possession_team) | (dist_to_ball <= threshold), np.inf, dist_to_ball + ) + + ball_carrier_idx = ( + np.argmin(filtered_distances) if np.isfinite(filtered_distances).any() else None + ) + return ball_carrier_idx diff --git a/unravel/utils/objects/__init__.py b/unravel/utils/objects/__init__.py index 3940548..4299b16 100644 --- a/unravel/utils/objects/__init__.py +++ b/unravel/utils/objects/__init__.py @@ -5,3 +5,4 @@ from .default_graph_frame import DefaultGraphFrame from 
.default_graph_settings import DefaultGraphSettings from .default_graph_converter import DefaultGraphConverter +from .default_dataset import DefaultDataset diff --git a/unravel/utils/objects/default_dataset.py b/unravel/utils/objects/default_dataset.py new file mode 100644 index 0000000..b31280e --- /dev/null +++ b/unravel/utils/objects/default_dataset.py @@ -0,0 +1,13 @@ +from dataclasses import dataclass + + +@dataclass +class DefaultDataset: + def load(self): + raise NotImplementedError() + + def add_dummy_labels(self): + raise NotImplementedError() + + def add_graph_ids(self): + raise NotImplementedError() From 30a6775083d81f2d097a776a8e3b35cb40c7320e Mon Sep 17 00:00:00 2001 From: "UnravelSports [JB]" Date: Sat, 25 Jan 2025 11:39:18 +0100 Subject: [PATCH 02/10] polars, working --- examples/2_big_data_bowl_guide.ipynb | 2 +- tests/test_kloppy_polars.py | 217 +++++++++++ unravel/soccer/graphs/__init__.py | 6 +- unravel/soccer/graphs/dataset.py | 221 ++++++++++-- unravel/soccer/graphs/graph_converter_pl.py | 337 ++++++++++-------- unravel/soccer/graphs/graph_settings_pl.py | 3 +- .../utils/objects/default_graph_settings.py | 3 - 7 files changed, 615 insertions(+), 174 deletions(-) create mode 100644 tests/test_kloppy_polars.py diff --git a/examples/2_big_data_bowl_guide.ipynb b/examples/2_big_data_bowl_guide.ipynb index faf20ea..b6ed01c 100644 --- a/examples/2_big_data_bowl_guide.ipynb +++ b/examples/2_big_data_bowl_guide.ipynb @@ -218,7 +218,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.11.11" } }, "nbformat": 4, diff --git a/tests/test_kloppy_polars.py b/tests/test_kloppy_polars.py new file mode 100644 index 0000000..08e3e6a --- /dev/null +++ b/tests/test_kloppy_polars.py @@ -0,0 +1,217 @@ +from pathlib import Path +from unravel.soccer import ( + SoccerGraphConverterPolars, + KloppyPolarsDataset +) +from unravel.utils import ( + dummy_labels, + dummy_graph_ids, + CustomSpektralDataset, 
+) + +from kloppy import skillcorner +from kloppy.domain import Ground, TrackingDataset, Orientation +from typing import List, Dict + +from spektral.data import Graph + +import pytest + +import numpy as np + + +class TestKloppyPolarsData: + @pytest.fixture + def match_data(self, base_dir: Path) -> str: + return base_dir / "files" / "skillcorner_match_data.json" + + @pytest.fixture + def structured_data(self, base_dir: Path) -> str: + return base_dir / "files" / "skillcorner_structured_data.json.gz" + + @pytest.fixture() + def kloppy_dataset(self, match_data: str, structured_data: str) -> TrackingDataset: + return skillcorner.load( + raw_data=structured_data, + meta_data=match_data, + coordinates="tracab", + include_empty_frames=False, + limit=500, + ) + + @pytest.fixture() + def kloppy_polars_dataset(self, kloppy_dataset: TrackingDataset) -> KloppyPolarsDataset: + dataset = KloppyPolarsDataset( + kloppy_dataset=kloppy_dataset, + ball_carrier_threshold=25.0, + ) + dataset.load() + dataset.add_dummy_labels( + by=["game_id", "frame_id"] + ) + dataset.add_graph_ids( + by=["game_id", "frame_id"] + ) + return dataset + + @pytest.fixture() + def spc_padding(self, kloppy_polars_dataset: KloppyPolarsDataset) -> SoccerGraphConverterPolars: + return SoccerGraphConverterPolars( + dataset=kloppy_polars_dataset, + chunk_size=2_0000, + non_potential_receiver_node_value=0.1, + max_player_speed=12.0, + max_player_acceleration=12.0, + max_ball_speed=13.5, + max_ball_acceleration=100, + self_loop_ball=True, + adjacency_matrix_connect_type="ball", + adjacency_matrix_type="split_by_team", + label_type="binary", + defending_team_node_value=0.0, + random_seed=False, + pad=True, + verbose=False, + ) + + @pytest.fixture() + def soccer_polars_converter(self, kloppy_polars_dataset: KloppyPolarsDataset) -> SoccerGraphConverterPolars: + # TODO: + # check if + # - random_seed + # - padding needs to be per team_id otherwise stuff breaks + # all work as expected and/or should be moved to the 
KloppyPolarsDataset + + return SoccerGraphConverterPolars( + dataset=kloppy_polars_dataset, + chunk_size=2_0000, + non_potential_receiver_node_value=0.1, + max_player_speed=12.0, + max_player_acceleration=12.0, + max_ball_speed=13.5, + max_ball_acceleration=100, + self_loop_ball=True, + adjacency_matrix_connect_type="ball", + adjacency_matrix_type="split_by_team", + label_type="binary", + defending_team_node_value=0.0, + random_seed=False, + pad=False, + verbose=False, + ) + + # @pytest.fixture() + # def gnnc_padding_random(self, dataset: TrackingDataset) -> SoccerGraphConverter: + # return SoccerGraphConverter( + # dataset=dataset, + # labels=dummy_labels(dataset), + # # settings + # ball_carrier_treshold=25.0, + # max_player_speed=12.0, + # max_ball_speed=28.0, + # boundary_correction=None, + # self_loop_ball=False, + # adjacency_matrix_connect_type="ball", + # adjacency_matrix_type="split_by_team", + # label_type="binary", + # defending_team_node_value=0.0, + # non_potential_receiver_node_value=0.1, + # infer_ball_ownership=True, + # infer_goalkeepers=True, + # random_seed=42, + # pad=True, + # verbose=False, + # ) + def test_padding(self, spc_padding: SoccerGraphConverterPolars): + """ + Test navigating (next/prev) through events + """ + spektral_graphs = spc_padding.to_spektral_graphs() + + assert 1 == 1 + + data = spektral_graphs + assert len(data) == 384 + assert isinstance(data[0], Graph) + + def test_to_spektral_graph(self, soccer_polars_converter: SoccerGraphConverterPolars): + """ + Test navigating (next/prev) through events + """ + spektral_graphs = soccer_polars_converter.to_spektral_graphs() + + assert 1 == 1 + + data = spektral_graphs + assert len(data) == 489 + assert isinstance(data[0], Graph) + + x = data[0].x + assert x.shape == (10, 12) + assert 0.31373436337428573 == pytest.approx(x[0, 0], abs=1e-5) + assert 0.06765375015355701 == pytest.approx(x[0, 4], abs=1e-5) + assert 0.47729475229688306 == pytest.approx(x[8, 2], abs=1e-5) + + e = data[0].e 
+ assert e.shape == (60, 6) + assert 0.0 == pytest.approx(e[0, 0], abs=1e-5) + assert 0.5 == pytest.approx(e[0, 4], abs=1e-5) + assert 0.579979482018554 == pytest.approx(e[8, 2], abs=1e-5) + + a = data[0].a + assert a.shape == (10, 10) + assert 1.0 == pytest.approx(a[0, 0], abs=1e-5) + assert 1.0 == pytest.approx(a[0, 4], abs=1e-5) + assert 0.0 == pytest.approx(a[8, 2], abs=1e-5) + + dataset = CustomSpektralDataset(graphs=spektral_graphs) + N, F, S, n_out, n = dataset.dimensions() + assert N == 20 + assert F == 12 + assert S == 6 + assert n_out == 1 + assert n == 489 + + train, test, val = dataset.split_test_train_validation( + split_train=4, + split_test=1, + split_validation=1, + by_graph_id=True, + random_seed=42, + ) + assert train.n_graphs == 326 + assert test.n_graphs == 81 + assert val.n_graphs == 82 + + train, test, val = dataset.split_test_train_validation( + split_train=4, + split_test=1, + split_validation=1, + by_graph_id=False, + random_seed=42, + ) + assert train.n_graphs == 326 + assert test.n_graphs == 81 + assert val.n_graphs == 82 + + train, test = dataset.split_test_train( + split_train=4, split_test=1, by_graph_id=False, random_seed=42 + ) + assert train.n_graphs == 391 + assert test.n_graphs == 98 + + train, test = dataset.split_test_train( + split_train=4, split_test=5, by_graph_id=False, random_seed=42 + ) + assert train.n_graphs == 217 + assert test.n_graphs == 272 + + with pytest.raises( + NotImplementedError, + match="Make sure split_train > split_test >= split_validation, other behaviour is not supported when by_graph_id is True...", + ): + dataset.split_test_train( + split_train=4, split_test=5, by_graph_id=True, random_seed=42 + ) + + diff --git a/unravel/soccer/graphs/__init__.py b/unravel/soccer/graphs/__init__.py index bd44fac..2991890 100644 --- a/unravel/soccer/graphs/__init__.py +++ b/unravel/soccer/graphs/__init__.py @@ -1,9 +1,9 @@ from .graph_converter import SoccerGraphConverter -from .graph_converter_pl import 
SoccerGraphConverterPL +from .graph_converter_pl import SoccerGraphConverterPolars from .graph_settings import SoccerGraphSettings -from .graph_settings_pl import GraphSettingsPL +from .graph_settings_pl import GraphSettingsPolars from .graph_frame import GraphFrame from .exceptions import * from .features import * -from .dataset import KloppyDataset +from .dataset import KloppyPolarsDataset diff --git a/unravel/soccer/graphs/dataset.py b/unravel/soccer/graphs/dataset.py index 459b1d6..b9c4243 100644 --- a/unravel/soccer/graphs/dataset.py +++ b/unravel/soccer/graphs/dataset.py @@ -16,8 +16,8 @@ import polars as pl -DEFAULT_PLAYER_SMOOTHING_PARAMS = {"window_length": 7, "polyorder": 2} -DEFAULT_BALL_SMOOTHING_PARAMS = {"window_length": 3, "polyorder": 2} +DEFAULT_PLAYER_SMOOTHING_PARAMS = {"window_length": 7, "polyorder": 1} +DEFAULT_BALL_SMOOTHING_PARAMS = {"window_length": 3, "polyorder": 1} @dataclass @@ -28,19 +28,27 @@ class SoccerObject: @dataclass -class KloppyDataset(DefaultDataset): +class KloppyPolarsDataset(DefaultDataset): kloppy_dataset: TrackingDataset + ball_carrier_threshold: float = None _identifier_column: str = field(default="id", init=False) + _graph_id_column: str = field(default="graph_id") + _label_column: str = field(default="label") _partition_by: List[str] = field( default_factory=lambda: ["id", "period_id"], init=False ) - + _infer_ball_owning_team_id: bool = field(default=False, init=False) + _overwrite_orientation: bool = field(default=False, init=False) + _infer_goalkeepers: bool = field(default=False, init=False) + def __transform_orientation(self): if not self.kloppy_dataset.metadata.flags & DatasetFlag.BALL_OWNING_TEAM: + self._overwrite_orientation = True + # In this package attacking is always left to right, so if this is not giving in Kloppy, overwrite it to_orientation = Orientation.STATIC_HOME_AWAY else: to_orientation = Orientation.BALL_OWNING_TEAM - + self.kloppy_dataset = DatasetTransformer.transform_dataset( 
dataset=self.kloppy_dataset, to_orientation=to_orientation, @@ -52,18 +60,35 @@ def __transform_orientation(self): return self.kloppy_dataset def __get_objects(self): + def __artificial_game_id() -> str: + from uuid import uuid4 + return str(uuid4()) + home_team, away_team = self.kloppy_dataset.metadata.teams - - home_players = [ - SoccerObject(p.player_id, p.team.team_id, p.starting_position.code) - for p in home_team.players - ] - away_players = [ - SoccerObject(p.player_id, p.team.team_id, p.starting_position.code) - for p in away_team.players - ] - ball_object = SoccerObject("ball", None, "ball") + + if all(item is None for item in [p.starting_position for p in home_team.players]): + self._infer_goalkeepers = True + home_players = [ + SoccerObject(p.player_id, p.team.team_id, None) + for p in home_team.players + ] + away_players = [ + SoccerObject(p.player_id, p.team.team_id, None) + for p in away_team.players + ] + else: + home_players = [ + SoccerObject(p.player_id, p.team.team_id, p.starting_position.code) + for p in home_team.players + ] + away_players = [ + SoccerObject(p.player_id, p.team.team_id, p.starting_position.code) + for p in away_team.players + ] + ball_object = SoccerObject("ball", "ball", "ball") game_id = self.kloppy_dataset.metadata.game_id + if game_id is None: + game_id = __artificial_game_id() return (home_players, away_players, ball_object, game_id) def __unpivot(self, object, coordinate): @@ -271,6 +296,10 @@ def __melt( continue melted_df = self.__unpivot(object, coordinate) + + if object.id == "ball" and coordinate == "z": + if melted_df[coordinate].is_null().all(): + melted_df = melted_df.with_columns([pl.lit(0.0).alias("z")]) if k == 0: melted_object_dfs.append(melted_df) else: @@ -288,17 +317,145 @@ def __melt( ) melted_dfs.append(object_df) - + df = pl.concat(melted_dfs, how="vertical") df = df.with_columns([pl.lit(game_id).alias("game_id")]) df = df.sort(by=["period_id", "timestamp", "team_id"], nulls_last=True) return df + + def 
__get_inferred_ball_owning_team_id(self, df: pl.DataFrame): + non_ball_owning_team = ( + df.filter(pl.col("ball_owning_team_id").is_null()) + ) + ball_owning_team = ( + df.filter(~pl.col("ball_owning_team_id").is_null()) + ) + + ball = ( + non_ball_owning_team.filter(pl.col('team_id') == "ball") + ) + players = ( + non_ball_owning_team.filter(pl.col('team_id') != "ball") + ) + result = ( + players.drop('ball_owning_team_id') + .join( + ball.select( + ['game_id', 'period_id', 'frame_id', + pl.col('x').alias('ball_x'), + pl.col('y').alias('ball_y'), + pl.col('z').alias('ball_z')] + ), + on=['game_id', 'period_id', 'frame_id'], + how='left' + ) + .with_columns([ + ((pl.col('x') - pl.col('ball_x'))**2 + + (pl.col('y') - pl.col('ball_y'))**2 + + (pl.col('z') - pl.col('ball_z'))**2 + ).sqrt().alias('distance') + ]) + .group_by(['game_id', 'period_id', 'frame_id']) + .agg([ + pl.when(pl.col('distance').min() < self.ball_carrier_threshold) + .then(pl.col('team_id').filter(pl.col('distance') == pl.col('distance').min()).first()) + .otherwise(None) + .alias('ball_owning_team_id'), + pl.all().sort_by('distance').first() + ]) + ) + non_ball_owning_team = ( + non_ball_owning_team.drop('ball_owning_team_id') + .join( + result.select(['game_id', 'period_id', 'frame_id', 'ball_owning_team_id']), + on=['game_id', 'period_id', 'frame_id'], + how='left' + ) + .filter( + ~pl.col("ball_owning_team_id").is_null() + ) + .with_columns([ + pl.col("ball_owning_team_id").cast(ball_owning_team.schema['team_id']) + ]) + .select(ball_owning_team.columns) + ) + ball_owning_team = ( + ball_owning_team + .with_columns([ + pl.col("ball_owning_team_id").cast(ball_owning_team.schema['team_id']) + ]) + ) + + new_df = ( + pl.concat([ + ball_owning_team, + non_ball_owning_team + ], how="vertical") + .sort(['game_id', 'period_id', 'frame_id', 'team_id']) + ) + return new_df + + def __get_inferred_goalkeepers(self, df: pl.DataFrame): + goal_x = self.pitch_dimensions.pitch_length / 2 + goal_y = 0 + + 
df_with_distances = ( + df.filter(pl.col('team_id') != "ball") + .with_columns([ + ((pl.col('x') - (-goal_x))**2 + (pl.col('y') - goal_y)**2).sqrt().alias('dist_left'), + ((pl.col('x') - goal_x)**2 + (pl.col('y') - goal_y)**2).sqrt().alias('dist_right') + ]) + ) + result = ( + df_with_distances + .with_columns([ + pl.col('dist_left').min().over(['game_id', 'period_id', 'frame_id', 'team_id']).alias('min_dist_left'), + pl.col('dist_right').min().over(['game_id', 'period_id', 'frame_id', 'team_id']).alias('min_dist_right') + ]) + .with_columns([ + pl.when(pl.col('team_id') == pl.col('ball_owning_team_id')) + .then( + pl.when(pl.col('dist_left') == pl.col('min_dist_left')) + .then(pl.lit('GK')) + .otherwise(None) + ) + .otherwise( + pl.when(pl.col('dist_right') == pl.col('min_dist_right')) + .then(pl.lit('GK')) + .otherwise(None) + ) + .alias('position_name') + ]) + .drop(['min_dist_left', 'min_dist_right', 'dist_left', 'dist_right']) + ) + ball_rows = df.filter(pl.col('team_id') == "ball") + non_ball_rows = result + + return ( + pl.concat([ball_rows, non_ball_rows], how="vertical") + .sort(['game_id', 'period_id', 'frame_id', 'team_id']) + ) + + def __fix_orientation_to_ball_owning(self, df: pl.DataFrame, home_team_id: Union[str, int]): + # When _overwrite_orientation is True, it means the orientation is "STATIC_HOME_AWAY" + # This means that when away is the attacking team we can flip all coordinates by -1.0 + + flip_columns = ['x', 'y', 'vx', 'vy', 'ax', 'ay'] + + return df.with_columns([ + pl.when(pl.col('ball_owning_team_id').cast(str) != str(home_team_id)) + .then(pl.col(flip_columns) * -1) + .otherwise(pl.col(flip_columns)) + ]) def load( self, player_smoothing_params: Union[dict, None] = DEFAULT_PLAYER_SMOOTHING_PARAMS, ball_smoothing_params: Union[dict, None] = DEFAULT_BALL_SMOOTHING_PARAMS, ): + if self.kloppy_dataset.metadata.orientation == Orientation.NOT_SET: + raise ValueError("Data sources with an undefined orientation can not be used inside the 
'unravelsports' package...") + self.kloppy_dataset = self.__transform_orientation() self.pitch_dimensions = self.kloppy_dataset.metadata.pitch_dimensions @@ -309,22 +466,40 @@ def load( df = self.__melt( self._home_players, self._away_players, self._ball_object, self._game_id ) + df = self.__add_velocity(df, player_smoothing_params, ball_smoothing_params) df = self.__add_acceleration(df) - self.data = df.drop(["dx", "dy", "dz", "dt", "dvx", "dvy", "dvz"]) - + df = df.drop(["dx", "dy", "dz", "dt", "dvx", "dvy", "dvz"]) + + df = df.filter( + ~(pl.col('x').is_null() & pl.col('y').is_null()) + ) + + if df['ball_owning_team_id'].is_null().all() and self.ball_carrier_threshold: + raise ValueError("This dataset requires us to infer the ball_owning_team_id, please specifiy a ball_carrier_threshold (float) to do so.") + + if self.ball_carrier_threshold is not None: + df = self.__get_inferred_ball_owning_team_id(df) + + if self._overwrite_orientation: + home_team, _ = self.kloppy_dataset.metadata.teams + df = self.__fix_orientation_to_ball_owning(df, home_team_id=home_team.team_id) + + if self._infer_goalkeepers: + df = self.__get_inferred_goalkeepers(df) + + self.data = df return self.data, self.pitch_dimensions def add_dummy_labels( self, - by: List[str] = ["game_id", "frame_id"], - column_name: str = "label", + by: List[str] = ["game_id", "frame_id"] ) -> pl.DataFrame: - self.data = add_dummy_label_column(self.data, by, column_name) + self.data = add_dummy_label_column(self.data, by, self._label_column) return self.data def add_graph_ids( - self, by: List[str] = ["game_id", "period_id"], column_name: str = "graph_id" + self, by: List[str] = ["game_id", "period_id"] ) -> pl.DataFrame: - self.data = add_graph_id_column(self.data, by, column_name) + self.data = add_graph_id_column(self.data, by, self._graph_id_column) return self.data diff --git a/unravel/soccer/graphs/graph_converter_pl.py b/unravel/soccer/graphs/graph_converter_pl.py index ad694f4..252ff03 100644 --- 
a/unravel/soccer/graphs/graph_converter_pl.py +++ b/unravel/soccer/graphs/graph_converter_pl.py @@ -8,7 +8,7 @@ from dataclasses import dataclass, field, asdict -from typing import List, Union, Dict, Literal +from typing import List, Union, Dict, Literal, Any from kloppy.domain import ( TrackingDataset, @@ -29,8 +29,8 @@ KeyMismatchError, ) -from .graph_settings_pl import GraphSettingsPL -from .dataset import KloppyDataset +from .graph_settings_pl import GraphSettingsPolars +from .dataset import KloppyPolarsDataset from .features import ( compute_node_features_pl, compute_adjacency_matrix_pl, @@ -46,7 +46,7 @@ @dataclass(repr=True) -class SoccerGraphConverterPL(DefaultGraphConverter): +class SoccerGraphConverterPolars(DefaultGraphConverter): """ Converts our dataset TrackingDataset into an internal structure @@ -62,36 +62,124 @@ class SoccerGraphConverterPL(DefaultGraphConverter): The latter can be useful when splitting graphs by possession or sequence id. In this case the dict would be {frame_id: sequence_id/possession_id}. Note that sequence_id/possession_id should probably be unique for the whole dataset. Perhaps like so {frame_id: 'match_id-sequence_id'}. Defaults to None. - infer_ball_ownership (bool): - Infers 'attacking_team' if no 'ball_owning_team' (Kloppy) or 'attacking_team' (List[Dict]) is provided, by finding player closest to ball using ball xyz. - Also infers ball_carrier within ball_carrier_threshold - infer_goalkeepers (bool): set True if no GK label is provider, set False for incomplete (broadcast tracking) data that might not have a GK in every frame ball_carrier_threshold (float): The distance threshold to determine the ball carrier. Defaults to 25.0. - boundary_correction (float): A correction factor for boundary calculations, used to correct out of bounds as a percentages (Used as 1+boundary_correction, ie 0.05). Defaults to None. 
non_potential_receiver_node_value (float): Value between 0 and 1 to assign to the defing team players """ - dataset: KloppyDataset = None - - label_col: str = "label" - graph_id_col: str = "graph_id" + dataset: KloppyPolarsDataset = None chunk_size: int = 2_0000 - - infer_goalkeepers: bool = True - infer_ball_ownership: bool = True - boundary_correction: float = None - ball_carrier_treshold: float = 25.0 - non_potential_receiver_node_value: float = 0.1 def __post_init__(self): self.pitch_dimensions: MetricPitchDimensions = self.dataset.pitch_dimensions + self.label_col = self.dataset._label_column + self.graph_id_col = self.dataset._graph_id_column + + self.ball_carrier_threshold = self.dataset.ball_carrier_threshold self.dataset = self.dataset.data self._sport_specific_checks() self.settings = self._apply_settings() self.dataset = self._apply_filters() + + if self.pad: + self.dataset = self._apply_padding(df=self.dataset) + + @staticmethod + def _apply_padding(df: pl.DataFrame) -> pl.DataFrame: + keep_columns = [ + 'timestamp', + 'ball_state', + 'position_name', + 'label', + 'graph_id' + ] + empty_columns = [ + 'id', 'x', 'y', 'z', 'vx', 'vy', + 'vz', 'v', 'ax', 'ay', 'az', 'a' + ] + group_by_columns = ['game_id', 'period_id', 'frame_id', 'team_id', 'ball_owning_team_id'] + + counts = ( + df.group_by(group_by_columns) + .agg( + pl.len().alias('count'), + *[pl.first(col).alias(col) for col in keep_columns] + ) + ) + + counts = counts.with_columns([ + pl.when(pl.col('team_id') == "ball") + .then(1) + .when(pl.col('team_id') == pl.col('ball_owning_team_id')) + .then(11) + .otherwise(11) + .alias('target_length') + ]) + + groups_to_pad = ( + counts + .filter(pl.col('count') < pl.col('target_length')) + .with_columns( + (pl.col('target_length') - pl.col('count')).alias('repeats') + ) + ) + + if len(groups_to_pad) == 0: + return df + + padding_rows = [] + for row in groups_to_pad.iter_rows(named=True): + base_row = {col: row[col] for col in keep_columns + 
group_by_columns} + padding_rows.extend([base_row] * row['repeats']) + + padding_df = pl.DataFrame(padding_rows) + + schema = df.schema + padding_df = padding_df.with_columns([ + pl.lit(0.0 if schema[col] != pl.String else "None").cast(schema[col]).alias(col) + for col in empty_columns + ]) + + padding_df = padding_df.select(df.columns) + + result = pl.concat([df, padding_df], how='vertical') + + total_frames = ( + result.select(['game_id', 'period_id', 'frame_id']) + .unique() + .height + ) + + frame_completeness = ( + result.group_by(['game_id', 'period_id', 'frame_id']) + .agg([ + (pl.col('team_id').eq("ball").sum() == 1).alias('has_ball'), + (pl.col('team_id').eq(pl.col('ball_owning_team_id')).sum() == 11).alias('has_owning_team'), + ((~pl.col('team_id').eq("ball") & ~pl.col('team_id').eq(pl.col('ball_owning_team_id'))).sum() == 11).alias('has_other_team') + ]) + .filter( + pl.col('has_ball') & pl.col('has_owning_team') & pl.col('has_other_team') + ) + ) + + complete_frames = frame_completeness.height + + dropped_frames = total_frames - complete_frames + if dropped_frames > 0: + import warnings + warnings.warn( + f"""Setting pad=True drops frames that do not have at least 1 object for the attacking team, defending team or ball. 
+ This operation dropped {dropped_frames} incomplete frames out of {total_frames} total frames ({(dropped_frames/total_frames)*100:.2f}%) + """ + ) + + return result.join( + frame_completeness, + on=['game_id', 'period_id', 'frame_id'], + how='inner' + ) def _apply_filters(self): return self.dataset.with_columns( @@ -123,20 +211,17 @@ def _apply_filters(self): ) def _apply_settings(self): - return GraphSettingsPL( + return GraphSettingsPolars( pitch_dimensions=self.pitch_dimensions, - ball_carrier_treshold=self.ball_carrier_treshold, + ball_carrier_treshold=self.ball_carrier_threshold, max_player_speed=self.max_player_speed, max_ball_speed=self.max_ball_speed, max_player_acceleration=self.max_player_acceleration, max_ball_acceleration=self.max_ball_acceleration, - boundary_correction=self.boundary_correction, self_loop_ball=self.self_loop_ball, adjacency_matrix_connect_type=self.adjacency_matrix_connect_type, adjacency_matrix_type=self.adjacency_matrix_type, label_type=self.label_type, - infer_ball_ownership=self.infer_ball_ownership, - infer_goalkeepers=self.infer_goalkeepers, defending_team_node_value=self.defending_team_node_value, non_potential_receiver_node_value=self.non_potential_receiver_node_value, random_seed=self.random_seed, @@ -164,20 +249,10 @@ def _sport_specific_checks(self): "Please specify a 'graph_id_col' and add that column to your 'dataset' ..." 
) - # Parameter Checks - if not isinstance(self.infer_goalkeepers, bool): - raise Exception("'infer_goalkeepers' should be of type boolean (bool)") - - if not isinstance(self.infer_ball_ownership, bool): - raise Exception("'infer_ball_ownership' should be of type boolean (bool)") - - if self.boundary_correction and not isinstance(self.boundary_correction, float): - raise Exception("'boundary_correction' should be of type float") - - if self.ball_carrier_treshold and not isinstance( - self.ball_carrier_treshold, float + if self.ball_carrier_threshold and not isinstance( + self.ball_carrier_threshold, float ): - raise Exception("'ball_carrier_treshold' should be of type float") + raise Exception("'ball_carrier_threshold' should be of type float") if self.non_potential_receiver_node_value and not isinstance( self.non_potential_receiver_node_value, float @@ -185,115 +260,91 @@ def _sport_specific_checks(self): raise Exception( "'non_potential_receiver_node_value' should be of type float" ) - - def _convert(self): - def __compute(args: List[pl.Series]) -> dict: - x = args[0].to_numpy() - y = args[1].to_numpy() - z = args[2].to_numpy() - v = args[3].to_numpy() - vx = args[4].to_numpy() - vy = args[5].to_numpy() - vz = args[6].to_numpy() - a = args[7].to_numpy() - ax = args[8].to_numpy() - ay = args[9].to_numpy() - az = args[10].to_numpy() - - team_id = args[6].to_numpy() - position_name = args[7].to_numpy() - ball_owning_team_id = args[8].to_numpy() - graph_id = args[9].to_numpy() - label = args[10].to_numpy() - - if not np.all(graph_id == graph_id[0]): - raise Exception( - "GraphId selection contains multiple different values. Make sure each GraphId is unique by at least playId and frameId..." - ) - - if not self.prediction and not np.all(label == label[0]): - raise Exception( - "Label selection contains multiple different values for a single selection (group by) of playId and frameId, make sure this is not the case. Each group can only have 1 label." 
- ) - - ball_carrier_idx = get_ball_carrier_idx( - x=x, - y=y, - z=z, - team=team_id, - possession_team=ball_owning_team_id, - ball_id=self.settings.ball_id, - threshold=self.settings.ball_carrier_treshold, + + @property + def __exprs_variables(self): + return [ + "x", "y", "z", + "v", "vx", "vy", "vz", + "a", "ax", "ay", "az", + "team_id", "position_name", "ball_owning_team_id", + self.graph_id_col, + self.label_col, + ] + + def __compute(self, args: List[pl.Series]) -> dict: + d = {col: args[i].to_numpy() for i, col in enumerate(self.__exprs_variables)} + + if not np.all(d[self.graph_id_col] == d[self.graph_id_col][0]): + raise Exception( + "GraphId selection contains multiple different values. Make sure each graph_id is unique by at least game_id and frame_id..." ) - adjacency_matrix = compute_adjacency_matrix_pl( - team=team_id, - possession_team=ball_owning_team_id, - settings=self.settings, - ball_carrier_idx=ball_carrier_idx, - ) - edge_features = compute_edge_features_pl( - adjacency_matrix=adjacency_matrix, - p3d=np.stack((x, y, z), axis=-1), - p2d=np.stack((x, y), axis=-1), - s=v, - velocity=np.stack((vx, vy), axis=-1), - team=team_id, - settings=self.settings, - ) - node_features = compute_node_features_pl( - x, - y, - s=v, - velocity=np.stack((vx, vy), axis=-1), - team=team_id, - possession_team=ball_owning_team_id, - is_gk=(position_name == self.settings.goalkeeper_id).astype(int), - settings=self.settings, + if not self.prediction and not np.all(d[self.label_col] == d[self.label_col][0]): + raise Exception( + """Label selection contains multiple different values for a single selection (group by) of game_id and frame_id, + make sure this is not the case. 
Each group can only have 1 label.""" ) - return { - "e": pl.Series( - [edge_features.tolist()], dtype=pl.List(pl.List(pl.Float64)) - ), - "x": pl.Series( - [node_features.tolist()], dtype=pl.List(pl.List(pl.Float64)) - ), - "a": pl.Series( - [adjacency_matrix.tolist()], dtype=pl.List(pl.List(pl.Int32)) - ), - "e_shape_0": edge_features.shape[0], - "e_shape_1": edge_features.shape[1], - "x_shape_0": node_features.shape[0], - "x_shape_1": node_features.shape[1], - "a_shape_0": adjacency_matrix.shape[0], - "a_shape_1": adjacency_matrix.shape[1], - self.graph_id_col: graph_id[0], - self.label_col: label[0], - } - + + ball_carrier_idx = get_ball_carrier_idx( + x=d['x'], y=d['y'], z=d['z'], + team=d['team_id'], + possession_team=d['ball_owning_team_id'], + ball_id=self.settings.ball_id, + threshold=self.settings.ball_carrier_treshold, + ) + adjacency_matrix = compute_adjacency_matrix_pl( + team=d['team_id'], + possession_team=d['ball_owning_team_id'], + settings=self.settings, + ball_carrier_idx=ball_carrier_idx, + ) + edge_features = compute_edge_features_pl( + adjacency_matrix=adjacency_matrix, + p3d=np.stack((d['x'], d['y'], d['z']), axis=-1), + p2d=np.stack((d['x'], d['y']), axis=-1), + s=d['v'], + velocity=np.stack((d['vx'], d['vy']), axis=-1), + team=d['team_id'], + settings=self.settings, + ) + node_features = compute_node_features_pl( + d['x'], + d['y'], + s=d['v'], + velocity=np.stack((d['vx'], d['vy']), axis=-1), + team=d['team_id'], + possession_team=d['ball_owning_team_id'], + is_gk=(d['position_name'] == self.settings.goalkeeper_id).astype(int), + settings=self.settings, + ) + return { + "e": pl.Series( + [edge_features.tolist()], dtype=pl.List(pl.List(pl.Float64)) + ), + "x": pl.Series( + [node_features.tolist()], dtype=pl.List(pl.List(pl.Float64)) + ), + "a": pl.Series( + [adjacency_matrix.tolist()], dtype=pl.List(pl.List(pl.Int32)) + ), + "e_shape_0": edge_features.shape[0], + "e_shape_1": edge_features.shape[1], + "x_shape_0": node_features.shape[0], + 
"x_shape_1": node_features.shape[1], + "a_shape_0": adjacency_matrix.shape[0], + "a_shape_1": adjacency_matrix.shape[1], + self.graph_id_col: d[self.graph_id_col][0], + self.label_col: d[self.label_col][0], + } + + def _convert(self): result_df = self.dataset.group_by( ["game_id", "frame_id"], maintain_order=True ).agg( pl.map_groups( - exprs=[ - "x", - "y", - "z", - "v", - "vx", - "vy", - "vz", - "a", - "ax", - "ay", - "az", - "team_id", - "position_name", - "ball_owning_team_id", - self.graph_id_col, - self.label_col, - ], - function=__compute, + exprs=self.__exprs_variables, + function=self.__compute, ).alias("result_dict") ) @@ -318,6 +369,8 @@ def __compute(args: List[pl.Series]) -> dict: ) return graph_df.drop("result_dict") + + def to_graph_frames(self) -> List[dict]: def __convert_to_graph_data_list(df): @@ -353,10 +406,10 @@ def __convert_to_graph_data_list(df): graph_list.extend(chunk_graph_list) return graph_list - + graph_df = self._convert() - self.graph_frames = __convert_to_graph_data_list(graph_df) - + self.graph_frames = self.__convert_to_graph_data_list(graph_df) + return self.graph_frames def to_spektral_graphs(self) -> List[Graph]: diff --git a/unravel/soccer/graphs/graph_settings_pl.py b/unravel/soccer/graphs/graph_settings_pl.py index 0ef8dce..4e934a9 100644 --- a/unravel/soccer/graphs/graph_settings_pl.py +++ b/unravel/soccer/graphs/graph_settings_pl.py @@ -8,10 +8,9 @@ @dataclass -class GraphSettingsPL(DefaultGraphSettings): +class GraphSettingsPolars(DefaultGraphSettings): ball_id: str = "ball" goalkeeper_id: str = "GK" - infer_goalkeepers: bool = True boundary_correction: float = None non_potential_receiver_node_value: float = 0.1 ball_carrier_treshold: float = 25.0 diff --git a/unravel/utils/objects/default_graph_settings.py b/unravel/utils/objects/default_graph_settings.py index fd67519..d77b5c1 100644 --- a/unravel/utils/objects/default_graph_settings.py +++ b/unravel/utils/objects/default_graph_settings.py @@ -18,9 +18,6 @@ class 
DefaultGraphSettings: Attributes: infer_ball_ownership (bool): Infers 'attacking_team' if no 'ball_owning_team' (Kloppy) or 'attacking_team' (List[Dict]) is provided, by finding player closest to ball using ball xyz. - Also infers ball_carrier within ball_carrier_threshold - infer_goalkeepers (bool): set True if no GK label is provider, set False for incomplete (broadcast tracking) data that might not have a GK in every frame - ball_carrier_threshold (float): The distance threshold to determine the ball carrier. Defaults to 25.0. max_player_speed (float): The maximum speed of a player in meters per second. Defaults to 12.0. max_ball_speed (float): The maximum speed of the ball in meters per second. Defaults to 28.0. boundary_correction (float): A correction factor for boundary calculations, used to correct out of bounds as a percentages (Used as 1+boundary_correction, ie 0.05). Defaults to None. From 37ad16c25603bad8e17debff7c160d94e1eac604 Mon Sep 17 00:00:00 2001 From: "UnravelSports [JB]" Date: Sun, 26 Jan 2025 08:40:45 +0100 Subject: [PATCH 03/10] polars implementation --- tests/test_kloppy_polars.py | 64 ++- unravel/soccer/graphs/dataset.py | 513 +++++++++++------- .../graphs/features/adjacency_matrix_pl.py | 10 +- .../soccer/graphs/features/node_features.py | 2 +- .../graphs/features/node_features_pl.py | 167 +----- unravel/soccer/graphs/graph_converter_pl.py | 300 +++++----- unravel/soccer/graphs/graph_settings_pl.py | 5 +- unravel/utils/features/utils.py | 2 +- 8 files changed, 528 insertions(+), 535 deletions(-) diff --git a/tests/test_kloppy_polars.py b/tests/test_kloppy_polars.py index 08e3e6a..4d70d2a 100644 --- a/tests/test_kloppy_polars.py +++ b/tests/test_kloppy_polars.py @@ -1,8 +1,5 @@ from pathlib import Path -from unravel.soccer import ( - SoccerGraphConverterPolars, - KloppyPolarsDataset -) +from unravel.soccer import SoccerGraphConverterPolars, KloppyPolarsDataset from unravel.utils import ( dummy_labels, dummy_graph_ids, @@ -38,24 +35,24 @@ 
def kloppy_dataset(self, match_data: str, structured_data: str) -> TrackingDatas include_empty_frames=False, limit=500, ) - + @pytest.fixture() - def kloppy_polars_dataset(self, kloppy_dataset: TrackingDataset) -> KloppyPolarsDataset: + def kloppy_polars_dataset( + self, kloppy_dataset: TrackingDataset + ) -> KloppyPolarsDataset: dataset = KloppyPolarsDataset( kloppy_dataset=kloppy_dataset, ball_carrier_threshold=25.0, ) dataset.load() - dataset.add_dummy_labels( - by=["game_id", "frame_id"] - ) - dataset.add_graph_ids( - by=["game_id", "frame_id"] - ) + dataset.add_dummy_labels(by=["game_id", "frame_id"]) + dataset.add_graph_ids(by=["game_id", "frame_id"]) return dataset - + @pytest.fixture() - def spc_padding(self, kloppy_polars_dataset: KloppyPolarsDataset) -> SoccerGraphConverterPolars: + def spc_padding( + self, kloppy_polars_dataset: KloppyPolarsDataset + ) -> SoccerGraphConverterPolars: return SoccerGraphConverterPolars( dataset=kloppy_polars_dataset, chunk_size=2_0000, @@ -75,13 +72,15 @@ def spc_padding(self, kloppy_polars_dataset: KloppyPolarsDataset) -> SoccerGraph ) @pytest.fixture() - def soccer_polars_converter(self, kloppy_polars_dataset: KloppyPolarsDataset) -> SoccerGraphConverterPolars: - # TODO: - # check if + def soccer_polars_converter( + self, kloppy_polars_dataset: KloppyPolarsDataset + ) -> SoccerGraphConverterPolars: + # TODO: + # check if # - random_seed # - padding needs to be per team_id otherwise stuff breaks # all work as expected and/or should be moved to the KloppyPolarsDataset - + return SoccerGraphConverterPolars( dataset=kloppy_polars_dataset, chunk_size=2_0000, @@ -123,9 +122,6 @@ def soccer_polars_converter(self, kloppy_polars_dataset: KloppyPolarsDataset) -> # verbose=False, # ) def test_padding(self, spc_padding: SoccerGraphConverterPolars): - """ - Test navigating (next/prev) through events - """ spektral_graphs = spc_padding.to_spektral_graphs() assert 1 == 1 @@ -134,32 +130,36 @@ def test_padding(self, spc_padding: 
SoccerGraphConverterPolars): assert len(data) == 384 assert isinstance(data[0], Graph) - def test_to_spektral_graph(self, soccer_polars_converter: SoccerGraphConverterPolars): + def test_to_spektral_graph( + self, soccer_polars_converter: SoccerGraphConverterPolars + ): """ Test navigating (next/prev) through events """ spektral_graphs = soccer_polars_converter.to_spektral_graphs() assert 1 == 1 - + data = spektral_graphs + assert data[0].id == "2417-1529" assert len(data) == 489 assert isinstance(data[0], Graph) x = data[0].x - assert x.shape == (10, 12) - assert 0.31373436337428573 == pytest.approx(x[0, 0], abs=1e-5) - assert 0.06765375015355701 == pytest.approx(x[0, 4], abs=1e-5) - assert 0.47729475229688306 == pytest.approx(x[8, 2], abs=1e-5) + n_players = x.shape[0] + assert x.shape == (n_players, 15) + assert 0.4524340998288571 == pytest.approx(x[0, 0], abs=1e-5) + assert 0.9948105277764999 == pytest.approx(x[0, 4], abs=1e-5) + assert 0.2941671698429814 == pytest.approx(x[8, 2], abs=1e-5) e = data[0].e - assert e.shape == (60, 6) + assert e.shape == (129, 6) assert 0.0 == pytest.approx(e[0, 0], abs=1e-5) assert 0.5 == pytest.approx(e[0, 4], abs=1e-5) - assert 0.579979482018554 == pytest.approx(e[8, 2], abs=1e-5) + assert 0.7140882876637022 == pytest.approx(e[8, 2], abs=1e-5) a = data[0].a - assert a.shape == (10, 10) + assert a.shape == (n_players, n_players) assert 1.0 == pytest.approx(a[0, 0], abs=1e-5) assert 1.0 == pytest.approx(a[0, 4], abs=1e-5) assert 0.0 == pytest.approx(a[8, 2], abs=1e-5) @@ -167,7 +167,7 @@ def test_to_spektral_graph(self, soccer_polars_converter: SoccerGraphConverterPo dataset = CustomSpektralDataset(graphs=spektral_graphs) N, F, S, n_out, n = dataset.dimensions() assert N == 20 - assert F == 12 + assert F == 15 assert S == 6 assert n_out == 1 assert n == 489 @@ -213,5 +213,3 @@ def test_to_spektral_graph(self, soccer_polars_converter: SoccerGraphConverterPo dataset.split_test_train( split_train=4, split_test=5, by_graph_id=True, 
random_seed=42 ) - - diff --git a/unravel/soccer/graphs/dataset.py b/unravel/soccer/graphs/dataset.py index b9c4243..e9d66b5 100644 --- a/unravel/soccer/graphs/dataset.py +++ b/unravel/soccer/graphs/dataset.py @@ -20,6 +20,44 @@ DEFAULT_BALL_SMOOTHING_PARAMS = {"window_length": 3, "polyorder": 1} +class Constant: + BALL = "ball" + + +class Column: + BALL_OWNING_TEAM_ID = "ball_owning_team_id" + BALL_OWNING_PLAYER_ID = "ball_owning_player_id" + IS_BALL_CARRIER = "is_ball_carrier" + PERIOD_ID = "period_id" + TIMESTAMP = "timestamp" + BALL_STATE = "ball_state" + FRAME_ID = "frame_id" + GAME_ID = "game_id" + TEAM_ID = "team_id" + OBJECT_ID = "id" + POSITION_NAME = "position_name" + + X = "x" + Y = "y" + Z = "z" + + V = "v" + VX = "vx" + VY = "vy" + VZ = "vz" + + A = "a" + AX = "ax" + AY = "ay" + AZ = "az" + + +class Group: + BY_FRAME = [Column.GAME_ID, Column.PERIOD_ID, Column.FRAME_ID] + BY_FRAME_TEAM = [Column.GAME_ID, Column.PERIOD_ID, Column.FRAME_ID, Column.TEAM_ID] + BY_OBJECT_PERIOD = [Column.OBJECT_ID, Column.PERIOD_ID] + + @dataclass class SoccerObject: id: Union[str, int] @@ -30,17 +68,19 @@ class SoccerObject: @dataclass class KloppyPolarsDataset(DefaultDataset): kloppy_dataset: TrackingDataset - ball_carrier_threshold: float = None - _identifier_column: str = field(default="id", init=False) + ball_carrier_threshold: float = 25.0 _graph_id_column: str = field(default="graph_id") _label_column: str = field(default="label") - _partition_by: List[str] = field( - default_factory=lambda: ["id", "period_id"], init=False - ) - _infer_ball_owning_team_id: bool = field(default=False, init=False) _overwrite_orientation: bool = field(default=False, init=False) _infer_goalkeepers: bool = field(default=False, init=False) - + + def __post_init__(self): + if not isinstance(self.kloppy_dataset, TrackingDataset): + raise Exception("'kloppy_dataset' should be of type float") + + if not isinstance(self.ball_carrier_threshold, float): + raise Exception("'ball_carrier_threshold' 
should be of type float") + def __transform_orientation(self): if not self.kloppy_dataset.metadata.flags & DatasetFlag.BALL_OWNING_TEAM: self._overwrite_orientation = True @@ -48,7 +88,7 @@ def __transform_orientation(self): to_orientation = Orientation.STATIC_HOME_AWAY else: to_orientation = Orientation.BALL_OWNING_TEAM - + self.kloppy_dataset = DatasetTransformer.transform_dataset( dataset=self.kloppy_dataset, to_orientation=to_orientation, @@ -62,11 +102,14 @@ def __transform_orientation(self): def __get_objects(self): def __artificial_game_id() -> str: from uuid import uuid4 + return str(uuid4()) - + home_team, away_team = self.kloppy_dataset.metadata.teams - - if all(item is None for item in [p.starting_position for p in home_team.players]): + + if all( + item is None for item in [p.starting_position for p in home_team.players] + ): self._infer_goalkeepers = True home_players = [ SoccerObject(p.player_id, p.team.team_id, None) @@ -85,7 +128,7 @@ def __artificial_game_id() -> str: SoccerObject(p.player_id, p.team.team_id, p.starting_position.code) for p in away_team.players ] - ball_object = SoccerObject("ball", "ball", "ball") + ball_object = SoccerObject(Constant.BALL, Constant.BALL, Constant.BALL) game_id = self.kloppy_dataset.metadata.game_id if game_id is None: game_id = __artificial_game_id() @@ -96,17 +139,17 @@ def __unpivot(self, object, coordinate): return self.data.unpivot( index=[ - "period_id", - "timestamp", - "frame_id", - "ball_state", - "ball_owning_team_id", + Column.PERIOD_ID, + Column.TIMESTAMP, + Column.FRAME_ID, + Column.BALL_STATE, + Column.BALL_OWNING_TEAM_ID, ], # Columns to keep on=[column], value_name=coordinate, - variable_name=self._identifier_column, + variable_name=Column.OBJECT_ID, ).with_columns( - pl.col(self._identifier_column).str.replace( + pl.col(Column.OBJECT_ID).str.replace( f"_{coordinate}", "" ) # Remove the coordinate suffix ) @@ -129,9 +172,13 @@ def __apply_smoothing(self, df: pl.DataFrame, smoothing_params: dict): 
"Missing parameter 'polyorder' in player_smoothing_params and/or ball_smoothing_params" ) - smoothed = df.group_by(self._partition_by, maintain_order=True).agg( + vx_smooth = f"{Column.VX}_smoothed" + vy_smooth = f"{Column.VY}_smoothed" + vz_smooth = f"{Column.VZ}_smoothed" + + smoothed = df.group_by(Group.BY_OBJECT_PERIOD, maintain_order=True).agg( [ - pl.col("vx") + pl.col(Column.VX) .map_elements( lambda vx: savgol_filter( vx, @@ -140,8 +187,8 @@ def __apply_smoothing(self, df: pl.DataFrame, smoothing_params: dict): ).tolist(), return_dtype=pl.List(pl.Float64), ) - .alias("vx_smoothed"), - pl.col("vy") + .alias(vx_smooth), + pl.col(Column.VY) .map_elements( lambda vy: savgol_filter( vy, @@ -150,8 +197,8 @@ def __apply_smoothing(self, df: pl.DataFrame, smoothing_params: dict): ).tolist(), return_dtype=pl.List(pl.Float64), ) - .alias("vy_smoothed"), - pl.col("vz") + .alias(vy_smooth), + pl.col(Column.VZ) .map_elements( lambda vy: savgol_filter( vy, @@ -160,18 +207,16 @@ def __apply_smoothing(self, df: pl.DataFrame, smoothing_params: dict): ).tolist(), return_dtype=pl.List(pl.Float64), ) - .alias("vz_smoothed"), + .alias(vz_smooth), ] ) # Explode the smoothed columns back to original shape - smoothed_exploded = smoothed.explode( - ["vx_smoothed", "vy_smoothed", "vz_smoothed"] - ) + smoothed_exploded = smoothed.explode([vx_smooth, vy_smooth, vz_smooth]) # Combine with the original DataFrame if needed return df.with_columns( - vx=smoothed_exploded["vx_smoothed"], - vy=smoothed_exploded["vy_smoothed"], - vz=smoothed_exploded["vz_smoothed"], + vx=smoothed_exploded[vx_smooth], + vy=smoothed_exploded[vy_smooth], + vz=smoothed_exploded[vz_smooth], ) def __add_velocity( @@ -181,60 +226,65 @@ def __add_velocity( ball_smoothing_params: dict, ): df = ( - df.sort(["id", "period_id", "timestamp", "team_id"], nulls_last=True) + df.sort( + Group.BY_OBJECT_PERIOD + [Column.TIMESTAMP, Column.TEAM_ID], + nulls_last=True, + ) .with_columns( [ # Calculate differences within each group 
- pl.col("x").diff().over(self._partition_by).alias("dx"), - pl.col("y").diff().over(self._partition_by).alias("dy"), - pl.col("z").diff().over(self._partition_by).alias("dz"), - (pl.col("timestamp").dt.total_milliseconds() / 1_000) + pl.col(Column.X).diff().over(Group.BY_OBJECT_PERIOD).alias("dx"), + pl.col(Column.Y).diff().over(Group.BY_OBJECT_PERIOD).alias("dy"), + pl.col(Column.Z).diff().over(Group.BY_OBJECT_PERIOD).alias("dz"), + (pl.col(Column.TIMESTAMP).dt.total_milliseconds() / 1_000) .diff() - .over(self._partition_by) + .over(Group.BY_OBJECT_PERIOD) .alias("dt"), ] ) .with_columns( [ # Compute velocity components - (pl.col("dx") / pl.col("dt")).alias("vx"), - (pl.col("dy") / pl.col("dt")).alias("vy"), - (pl.col("dz") / pl.col("dt")).alias("vz"), + (pl.col("dx") / pl.col("dt")).alias(Column.VX), + (pl.col("dy") / pl.col("dt")).alias(Column.VY), + (pl.col("dz") / pl.col("dt")).alias(Column.VZ), ] ) .with_columns( [ # Fill null values in vx and vy - pl.col("vx").fill_null(0).alias("vx"), - pl.col("vy").fill_null(0).alias("vy"), - pl.col("vz").fill_null(0).alias("vz"), + pl.col(Column.VX).fill_null(0).alias(Column.VX), + pl.col(Column.VY).fill_null(0).alias(Column.VY), + pl.col(Column.VZ).fill_null(0).alias(Column.VZ), ] ) ) if player_smoothing_params: player_df = self.__apply_smoothing( - df=df.filter(pl.col(self._identifier_column) != self._ball_object.id), + df=df.filter(pl.col(Column.OBJECT_ID) != self._ball_object.id), smoothing_params=player_smoothing_params, ) else: - player_df = df.filter( - pl.col(self._identifier_column) != self._ball_object.id - ) + player_df = df.filter(pl.col(Column.OBJECT_ID) != self._ball_object.id) if ball_smoothing_params: ball_df = self.__apply_smoothing( - df.filter(pl.col(self._identifier_column) == self._ball_object.id), + df.filter(pl.col(Column.OBJECT_ID) == self._ball_object.id), smoothing_params=ball_smoothing_params, ) else: - ball_df = df.filter(pl.col(self._identifier_column) == self._ball_object.id) + ball_df = 
df.filter(pl.col(Column.OBJECT_ID) == self._ball_object.id) df = pl.concat([player_df, ball_df]) df = df.with_columns( [ - (pl.col("vx") ** 2 + pl.col("vy") ** 2 + pl.col("vz") ** 2) + ( + pl.col(Column.VX) ** 2 + + pl.col(Column.VY) ** 2 + + pl.col(Column.VZ) ** 2 + ) .sqrt() - .alias("v") + .alias(Column.V) ] ) @@ -245,33 +295,37 @@ def __add_acceleration(self, df: pl.DataFrame): df.with_columns( [ # Calculate differences in vx, vy, and dt for acceleration - pl.col("vx").diff().over(self._partition_by).alias("dvx"), - pl.col("vy").diff().over(self._partition_by).alias("dvy"), - pl.col("vz").diff().over(self._partition_by).alias("dvz"), + pl.col(Column.VX).diff().over(Group.BY_OBJECT_PERIOD).alias("dvx"), + pl.col(Column.VY).diff().over(Group.BY_OBJECT_PERIOD).alias("dvy"), + pl.col(Column.VZ).diff().over(Group.BY_OBJECT_PERIOD).alias("dvz"), ] ) .with_columns( [ # Compute ax and ay - (pl.col("dvx") / pl.col("dt")).alias("ax"), - (pl.col("dvy") / pl.col("dt")).alias("ay"), - (pl.col("dvz") / pl.col("dt")).alias("az"), + (pl.col("dvx") / pl.col("dt")).alias(Column.AX), + (pl.col("dvy") / pl.col("dt")).alias(Column.AY), + (pl.col("dvz") / pl.col("dt")).alias(Column.AZ), ] ) .with_columns( [ # Fill null values in vx and vy - pl.col("ax").fill_null(0).alias("ax"), - pl.col("ay").fill_null(0).alias("ay"), - pl.col("az").fill_null(0).alias("az"), + pl.col(Column.AX).fill_null(0).alias(Column.AX), + pl.col(Column.AY).fill_null(0).alias(Column.AY), + pl.col(Column.AZ).fill_null(0).alias(Column.AZ), ] ) .with_columns( [ # Compute magnitude of acceleration a - (pl.col("ax") ** 2 + pl.col("ay") ** 2 + pl.col("az") ** 2) + ( + pl.col(Column.AX) ** 2 + + pl.col(Column.AY) ** 2 + + pl.col(Column.AZ) ** 2 + ) .sqrt() - .alias("a") + .alias(Column.A) ] ) ) @@ -289,17 +343,19 @@ def __melt( for object in [ball_object] + home_players + away_players: melted_object_dfs = [] - for k, coordinate in enumerate(["x", "y", "z"]): - if object.id != "ball" and coordinate == "z": + for k, 
coordinate in enumerate([Column.X, Column.Y, Column.Z]): + if object.id != Constant.BALL and coordinate == Column.Z: continue if not any(object.id in column for column in columns): continue melted_df = self.__unpivot(object, coordinate) - - if object.id == "ball" and coordinate == "z": + + if object.id == Constant.BALL and coordinate == Column.Z: if melted_df[coordinate].is_null().all(): - melted_df = melted_df.with_columns([pl.lit(0.0).alias("z")]) + melted_df = melted_df.with_columns( + [pl.lit(0.0).alias(Column.Z)] + ) if k == 0: melted_object_dfs.append(melted_df) else: @@ -307,146 +363,188 @@ def __melt( if melted_object_dfs: object_df = pl.concat(melted_object_dfs, how="horizontal") - if "z" not in object_df.columns: - object_df = object_df.with_columns([pl.lit(0.0).alias("z")]) + if Column.Z not in object_df.columns: + object_df = object_df.with_columns([pl.lit(0.0).alias(Column.Z)]) object_df = object_df.with_columns( [ - pl.lit(object.team_id).cast(pl.Utf8).alias("team_id"), - pl.lit(object.position_name).alias("position_name"), + pl.lit(object.team_id).cast(pl.Utf8).alias(Column.TEAM_ID), + pl.lit(object.position_name).alias(Column.POSITION_NAME), ] ) melted_dfs.append(object_df) - + df = pl.concat(melted_dfs, how="vertical") - df = df.with_columns([pl.lit(game_id).alias("game_id")]) - df = df.sort(by=["period_id", "timestamp", "team_id"], nulls_last=True) - return df - - def __get_inferred_ball_owning_team_id(self, df: pl.DataFrame): - non_ball_owning_team = ( - df.filter(pl.col("ball_owning_team_id").is_null()) - ) - ball_owning_team = ( - df.filter(~pl.col("ball_owning_team_id").is_null()) - ) - - ball = ( - non_ball_owning_team.filter(pl.col('team_id') == "ball") - ) - players = ( - non_ball_owning_team.filter(pl.col('team_id') != "ball") + df = df.with_columns([pl.lit(game_id).alias(Column.GAME_ID)]) + df = df.sort( + by=[Column.PERIOD_ID, Column.TIMESTAMP, Column.TEAM_ID], nulls_last=True ) + return df + + def __infer_ball_carrier(self, df: 
pl.DataFrame): + if Column.BALL_OWNING_PLAYER_ID not in df.columns: + df = df.with_columns( + pl.lit(False) + .cast(df.schema[Column.OBJECT_ID]) + .alias(Column.BALL_OWNING_PLAYER_ID) + ) + + # handle the non ball owning frames + ball = df.filter(pl.col(Column.TEAM_ID) == Constant.BALL) + players = df.filter(pl.col(Column.TEAM_ID) != Constant.BALL) + + # ball owning team is empty, so we can drop it. Goal is to replace it result = ( - players.drop('ball_owning_team_id') - .join( + players.join( ball.select( - ['game_id', 'period_id', 'frame_id', - pl.col('x').alias('ball_x'), - pl.col('y').alias('ball_y'), - pl.col('z').alias('ball_z')] + Group.BY_FRAME + + [ + pl.col(Column.X).alias("ball_x"), + pl.col(Column.Y).alias("ball_y"), + pl.col(Column.Z).alias("ball_z"), + ] ), - on=['game_id', 'period_id', 'frame_id'], - how='left' + on=Group.BY_FRAME, + how="left", ) - .with_columns([ - ((pl.col('x') - pl.col('ball_x'))**2 + - (pl.col('y') - pl.col('ball_y'))**2 + - (pl.col('z') - pl.col('ball_z'))**2 - ).sqrt().alias('distance') - ]) - .group_by(['game_id', 'period_id', 'frame_id']) - .agg([ - pl.when(pl.col('distance').min() < self.ball_carrier_threshold) - .then(pl.col('team_id').filter(pl.col('distance') == pl.col('distance').min()).first()) - .otherwise(None) - .alias('ball_owning_team_id'), - pl.all().sort_by('distance').first() - ]) - ) - non_ball_owning_team = ( - non_ball_owning_team.drop('ball_owning_team_id') - .join( - result.select(['game_id', 'period_id', 'frame_id', 'ball_owning_team_id']), - on=['game_id', 'period_id', 'frame_id'], - how='left' + .with_columns( + [ + ( + (pl.col(Column.X) - pl.col("ball_x")) ** 2 + + (pl.col(Column.Y) - pl.col("ball_y")) ** 2 + + (pl.col(Column.Z) - pl.col("ball_z")) ** 2 + ) + .sqrt() + .alias("ball_dist") + ] ) - .filter( - ~pl.col("ball_owning_team_id").is_null() + .group_by(Group.BY_FRAME) + .agg( + [ + pl.when((pl.col(Column.BALL_OWNING_TEAM_ID).is_null())) + .then( + pl.col(Column.TEAM_ID) + .filter( + 
(pl.col("ball_dist") == pl.col("ball_dist").min()) + & (pl.col("ball_dist").min() < self.ball_carrier_threshold) + ) + .first() + ) + .otherwise(pl.col(Column.BALL_OWNING_TEAM_ID)) + .alias(Column.BALL_OWNING_TEAM_ID), + pl.when((pl.col(Column.BALL_OWNING_PLAYER_ID).is_null())) + .then( + pl.col(Column.OBJECT_ID) + .filter( + (pl.col("ball_dist") == pl.col("ball_dist").min()) + & (pl.col("ball_dist").min() < self.ball_carrier_threshold) + ) + .first() + ) + .otherwise(pl.col(Column.BALL_OWNING_PLAYER_ID)) + .alias(Column.BALL_OWNING_PLAYER_ID), + ] + ) + .with_columns( + [ + pl.col(Column.BALL_OWNING_PLAYER_ID) + .list.first() + .alias(Column.BALL_OWNING_PLAYER_ID), + pl.col(Column.BALL_OWNING_TEAM_ID) + .list.first() + .alias(Column.BALL_OWNING_TEAM_ID), + ] ) - .with_columns([ - pl.col("ball_owning_team_id").cast(ball_owning_team.schema['team_id']) - ]) - .select(ball_owning_team.columns) - ) - ball_owning_team = ( - ball_owning_team - .with_columns([ - pl.col("ball_owning_team_id").cast(ball_owning_team.schema['team_id']) - ]) ) - - new_df = ( - pl.concat([ - ball_owning_team, - non_ball_owning_team - ], how="vertical") - .sort(['game_id', 'period_id', 'frame_id', 'team_id']) + df = ( + df.drop([Column.BALL_OWNING_PLAYER_ID, Column.BALL_OWNING_TEAM_ID]) + .join(result, how="left", on=Group.BY_FRAME) + .with_columns( + pl.when( + pl.col(Column.OBJECT_ID) == pl.col(Column.BALL_OWNING_PLAYER_ID) + ) + .then(True) + .otherwise(False) + .alias(Column.IS_BALL_CARRIER) + ) + .drop(Column.BALL_OWNING_PLAYER_ID) + .drop_nulls(subset=Column.BALL_OWNING_TEAM_ID) ) - return new_df - - def __get_inferred_goalkeepers(self, df: pl.DataFrame): + return df + + def __infer_goalkeepers(self, df: pl.DataFrame): goal_x = self.pitch_dimensions.pitch_length / 2 goal_y = 0 - - df_with_distances = ( - df.filter(pl.col('team_id') != "ball") - .with_columns([ - ((pl.col('x') - (-goal_x))**2 + (pl.col('y') - goal_y)**2).sqrt().alias('dist_left'), - ((pl.col('x') - goal_x)**2 + (pl.col('y') 
- goal_y)**2).sqrt().alias('dist_right') - ]) + + df_with_distances = df.filter( + pl.col(Column.TEAM_ID) != Constant.BALL + ).with_columns( + [ + ((pl.col(Column.X) - (-goal_x)) ** 2 + (pl.col(Column.Y) - goal_y) ** 2) + .sqrt() + .alias("dist_left"), + ((pl.col(Column.X) - goal_x) ** 2 + (pl.col(Column.Y) - goal_y) ** 2) + .sqrt() + .alias("dist_right"), + ] ) result = ( - df_with_distances - .with_columns([ - pl.col('dist_left').min().over(['game_id', 'period_id', 'frame_id', 'team_id']).alias('min_dist_left'), - pl.col('dist_right').min().over(['game_id', 'period_id', 'frame_id', 'team_id']).alias('min_dist_right') - ]) - .with_columns([ - pl.when(pl.col('team_id') == pl.col('ball_owning_team_id')) - .then( - pl.when(pl.col('dist_left') == pl.col('min_dist_left')) - .then(pl.lit('GK')) - .otherwise(None) - ) - .otherwise( - pl.when(pl.col('dist_right') == pl.col('min_dist_right')) - .then(pl.lit('GK')) - .otherwise(None) - ) - .alias('position_name') - ]) - .drop(['min_dist_left', 'min_dist_right', 'dist_left', 'dist_right']) + df_with_distances.with_columns( + [ + pl.col("dist_left") + .min() + .over(Group.BY_FRAME_TEAM) + .alias("min_dist_left"), + pl.col("dist_right") + .min() + .over(Group.BY_FRAME_TEAM) + .alias("min_dist_right"), + ] + ) + .with_columns( + [ + pl.when( + pl.col(Column.TEAM_ID) == pl.col(Column.BALL_OWNING_TEAM_ID) + ) + .then( + pl.when(pl.col("dist_left") == pl.col("min_dist_left")) + .then(pl.lit("GK")) + .otherwise(None) + ) + .otherwise( + pl.when(pl.col("dist_right") == pl.col("min_dist_right")) + .then(pl.lit("GK")) + .otherwise(None) + ) + .alias("position_name") + ] + ) + .drop(["min_dist_left", "min_dist_right", "dist_left", "dist_right"]) ) - ball_rows = df.filter(pl.col('team_id') == "ball") + ball_rows = df.filter(pl.col(Column.TEAM_ID) == Constant.BALL) non_ball_rows = result - return ( - pl.concat([ball_rows, non_ball_rows], how="vertical") - .sort(['game_id', 'period_id', 'frame_id', 'team_id']) + return 
pl.concat([ball_rows, non_ball_rows], how="vertical").sort( + Group.BY_FRAME_TEAM ) - - def __fix_orientation_to_ball_owning(self, df: pl.DataFrame, home_team_id: Union[str, int]): + + def __fix_orientation_to_ball_owning( + self, df: pl.DataFrame, home_team_id: Union[str, int] + ): # When _overwrite_orientation is True, it means the orientation is "STATIC_HOME_AWAY" # This means that when away is the attacking team we can flip all coordinates by -1.0 - - flip_columns = ['x', 'y', 'vx', 'vy', 'ax', 'ay'] - - return df.with_columns([ - pl.when(pl.col('ball_owning_team_id').cast(str) != str(home_team_id)) - .then(pl.col(flip_columns) * -1) - .otherwise(pl.col(flip_columns)) - ]) + + flip_columns = [Column.X, Column.Y, Column.VX, Column.VY, Column.AX, Column.AY] + + return df.with_columns( + [ + pl.when( + pl.col(Column.BALL_OWNING_TEAM_ID).cast(str) != str(home_team_id) + ) + .then(pl.col(flip_columns) * -1) + .otherwise(pl.col(flip_columns)) + ] + ) def load( self, @@ -454,8 +552,10 @@ def load( ball_smoothing_params: Union[dict, None] = DEFAULT_BALL_SMOOTHING_PARAMS, ): if self.kloppy_dataset.metadata.orientation == Orientation.NOT_SET: - raise ValueError("Data sources with an undefined orientation can not be used inside the 'unravelsports' package...") - + raise ValueError( + "Data sources with an undefined orientation can not be used inside the 'unravelsports' package..." 
+ ) + self.kloppy_dataset = self.__transform_orientation() self.pitch_dimensions = self.kloppy_dataset.metadata.pitch_dimensions @@ -466,40 +566,39 @@ def load( df = self.__melt( self._home_players, self._away_players, self._ball_object, self._game_id ) - + df = self.__add_velocity(df, player_smoothing_params, ball_smoothing_params) df = self.__add_acceleration(df) df = df.drop(["dx", "dy", "dz", "dt", "dvx", "dvy", "dvz"]) - - df = df.filter( - ~(pl.col('x').is_null() & pl.col('y').is_null()) - ) - - if df['ball_owning_team_id'].is_null().all() and self.ball_carrier_threshold: - raise ValueError("This dataset requires us to infer the ball_owning_team_id, please specifiy a ball_carrier_threshold (float) to do so.") - - if self.ball_carrier_threshold is not None: - df = self.__get_inferred_ball_owning_team_id(df) - + + df = df.filter(~(pl.col(Column.X).is_null() & pl.col(Column.Y).is_null())) + + if ( + df[Column.BALL_OWNING_TEAM_ID].is_null().all() + and self.ball_carrier_threshold is None + ): + raise ValueError( + f"This dataset requires us to infer the {Column.BALL_OWNING_TEAM_ID}, please specifiy a ball_carrier_threshold (float) to do so." 
+ ) + + df = self.__infer_ball_carrier(df) + if self._overwrite_orientation: home_team, _ = self.kloppy_dataset.metadata.teams - df = self.__fix_orientation_to_ball_owning(df, home_team_id=home_team.team_id) - + df = self.__fix_orientation_to_ball_owning( + df, home_team_id=home_team.team_id + ) + if self._infer_goalkeepers: - df = self.__get_inferred_goalkeepers(df) - + df = self.__infer_goalkeepers(df) + self.data = df return self.data, self.pitch_dimensions - def add_dummy_labels( - self, - by: List[str] = ["game_id", "frame_id"] - ) -> pl.DataFrame: + def add_dummy_labels(self, by: List[str] = ["game_id", "frame_id"]) -> pl.DataFrame: self.data = add_dummy_label_column(self.data, by, self._label_column) return self.data - def add_graph_ids( - self, by: List[str] = ["game_id", "period_id"] - ) -> pl.DataFrame: + def add_graph_ids(self, by: List[str] = ["game_id", "period_id"]) -> pl.DataFrame: self.data = add_graph_id_column(self.data, by, self._graph_id_column) return self.data diff --git a/unravel/soccer/graphs/features/adjacency_matrix_pl.py b/unravel/soccer/graphs/features/adjacency_matrix_pl.py index 7a5b2d2..2e27ea2 100644 --- a/unravel/soccer/graphs/features/adjacency_matrix_pl.py +++ b/unravel/soccer/graphs/features/adjacency_matrix_pl.py @@ -3,19 +3,21 @@ from ....utils import AdjacencyMatrixType, AdjacenyMatrixConnectType, distance_to_ball +from ..dataset import Constant -def compute_adjacency_matrix_pl(team, possession_team, settings, ball_carrier_idx): +def compute_adjacency_matrix_pl(team, ball_owning_team, settings, ball_carrier_idx): adjacency_matrix_type = settings.adjacency_matrix_type adjacency_matrix_connect_type = settings.adjacency_matrix_connect_type - ball_id = settings.ball_id + ball_id = Constant.BALL + + exclusion_ids = np.asarray([ball_id, *np.unique(ball_owning_team)]) - exclusion_ids = np.asarray([ball_id, *np.unique(possession_team)]) defensive_team = np.setdiff1d(team, exclusion_ids)[0] if adjacency_matrix_type == 
AdjacencyMatrixType.DENSE: adjacency_matrix = np.ones((team.shape[0], team.shape[0])).astype(np.int32) elif adjacency_matrix_type == AdjacencyMatrixType.DENSE_AP: - is_att = team == np.unique(possession_team)[0] + is_att = team == np.unique(ball_owning_team)[0] adjacency_matrix = np.outer(is_att, is_att).astype(int) elif adjacency_matrix_type == AdjacencyMatrixType.DENSE_DP: is_def = team == defensive_team diff --git a/unravel/soccer/graphs/features/node_features.py b/unravel/soccer/graphs/features/node_features.py index 7127404..dd532b0 100644 --- a/unravel/soccer/graphs/features/node_features.py +++ b/unravel/soccer/graphs/features/node_features.py @@ -54,7 +54,7 @@ def player_features(p, team, potential_receiver=None): ), ( 0.0 - if np.isnan(p.x1) + if np.isnan(p.y1) else normalize_coords(p.y1, pitch_dimensions.y_dim.max) ), 0.0 if np.isnan(p.x1) else unit_vector(p.velocity)[0], diff --git a/unravel/soccer/graphs/features/node_features_pl.py b/unravel/soccer/graphs/features/node_features_pl.py index c1132b3..804ebd4 100644 --- a/unravel/soccer/graphs/features/node_features_pl.py +++ b/unravel/soccer/graphs/features/node_features_pl.py @@ -28,6 +28,7 @@ def compute_node_features_pl( team, possession_team, is_gk, + ball_carrier, settings, ): ball_id = settings.ball_id @@ -63,8 +64,8 @@ def compute_node_features_pl( uv_velocity = unit_vectors(velocity) angles = normalize_angles(np.arctan2(uv_velocity[:, 1], uv_velocity[:, 0])) - sin_normed = normalize_sincos(np.sin(angles)) - cos_normed = normalize_sincos(np.cos(angles)) + v_sin_normed = normalize_sincos(np.sin(angles)) + v_cos_normed = normalize_sincos(np.cos(angles)) dist_to_goal = np.linalg.norm(position - goal_mouth_position, axis=1) normed_dist_to_goal = normalize_distance( @@ -75,6 +76,16 @@ def compute_node_features_pl( value=dist_to_ball, max_distance=max_dist_to_player ) + vec_to_goal = goal_mouth_position - position + angle_to_goal = np.arctan2(vec_to_goal[:, 1], vec_to_goal[:, 0]) + goal_sin_normed = 
normalize_sincos(np.sin(angle_to_goal)) + goal_cos_normed = normalize_sincos(np.cos(angle_to_goal)) + + vec_to_ball = ball_position - position + angle_to_ball = np.arctan2(vec_to_ball[:, 1], vec_to_ball[:, 0]) + ball_sin_normed = normalize_sincos(np.sin(angle_to_ball)) + ball_cos_normed = normalize_sincos(np.cos(angle_to_ball)) + is_possession_team = np.where( team == possession_team, 1, settings.defending_team_node_value ) @@ -86,159 +97,21 @@ def compute_node_features_pl( ( x_normed, y_normed, - uv_velocity[:, 0], - uv_velocity[:, 1], s_normed, - sin_normed, - cos_normed, + v_sin_normed, + v_cos_normed, normed_dist_to_goal, normed_dist_to_ball, is_possession_team, is_gk, is_ball, + goal_sin_normed, + goal_cos_normed, + ball_sin_normed, + ball_cos_normed, + ball_carrier, ), axis=-1, ) ) - return X - - -# def node_features( -# attacking_players, -# defending_players, -# ball, -# max_player_speed, -# max_ball_speed, -# ball_carrier_idx, -# pitch_dimensions, -# include_ball_node: bool = True, -# defending_team_node_value: float = 0.1, -# non_potential_receiver_node_value: float = 0.1, -# ): -# """ -# node features matrix is (n_nodes, n_node_features) (<=23, 17) -# each player (and optionally ball) is a node - -# player_features n_node_features must be equal to ball_features n_node_features -# """ - -# goal_mouth_position = ( -# pitch_dimensions.pitch_length, -# pitch_dimensions.pitch_width / 2, -# ) -# max_dist_to_player = np.sqrt( -# pitch_dimensions.pitch_length**2 + pitch_dimensions.pitch_width**2 -# ) -# max_dist_to_goal = np.sqrt( -# pitch_dimensions.pitch_length**2 + pitch_dimensions.pitch_width**2 -# ) - -# def player_features(p, team, potential_receiver=None): -# ball_angle = math.atan2(p.y1 - ball.y1, p.x1 - ball.x1) -# goal_angle = math.atan2( -# p.y1 - goal_mouth_position[0], p.x1 - goal_mouth_position[1] -# ) - -# player_node_features = [ -# ( -# 0.0 -# if np.isnan(p.x1) -# else normalize_coords(p.x1, pitch_dimensions.x_dim.max) -# ), -# ( -# 0.0 -# if 
np.isnan(p.x1) -# else normalize_coords(p.y1, pitch_dimensions.y_dim.max) -# ), -# 0.0 if np.isnan(p.x1) else unit_vector(p.velocity)[0], -# 0.0 if np.isnan(p.x1) else unit_vector(p.velocity)[1], -# ( -# 0.0 -# if np.isnan(p.x1) -# else round(normalize_speed(p.speed, max_speed=max_player_speed), 3) -# ), -# ( -# 0.0 -# if np.isnan(p.x1) -# else normalize_angles(np.arctan2(p.velocity[1], p.velocity[0])) -# ), -# ( -# 0.0 -# if np.isnan(p.x1) -# else normalize_distance( -# np.linalg.norm(p.position - goal_mouth_position), -# max_distance=max_dist_to_goal, -# ) -# ), # distance to the goal mouth -# 0.0 if np.isnan(p.x1) else normalize_angles(goal_angle), -# ( -# 0.0 -# if np.isnan(p.x1) -# else normalize_distance( -# np.linalg.norm(p.position - ball.position), -# max_distance=max_dist_to_player, -# ) -# ), # distance to the ball -# 0.0 if np.isnan(p.x1) else normalize_angles(ball_angle), -# 0.0 if np.isnan(p.x1) else team, -# # 1 if player is on same team but not in possession, 0.1 for all other players, 0.1 if the player is 'missing' -# ( -# 0.0 -# if np.isnan(p.x1) -# else 1.0 if potential_receiver else non_potential_receiver_node_value -# ), -# ] -# return player_node_features - -# def ball_features(ball): -# goal_angle = math.atan2( -# ball.y1 - goal_mouth_position[1], ball.x1 - goal_mouth_position[0] -# ) -# ball_node_features = [ -# normalize_coords(ball.x1, pitch_dimensions.x_dim.max), -# normalize_coords(ball.y1, pitch_dimensions.y_dim.max), -# unit_vector(ball.velocity)[0], -# unit_vector(ball.velocity)[1], -# round(normalize_speed(ball.speed, max_speed=max_ball_speed), 3), -# normalize_angles(np.arctan2(ball.velocity[1], ball.velocity[0])), -# normalize_distance( -# np.linalg.norm(ball.position - goal_mouth_position), -# max_distance=max_dist_to_goal, -# ), # distance to the goal mouth -# normalize_angles(goal_angle), -# # ball_angle 2x, ball_dist 2x, attacking_team 2x, ball carrier, potential receiver (all always 0 for ball) -# 0, -# 0, -# 0, -# 0, # , 0 -# 
] - -# return np.asarray([ball_node_features]) - -# # loop over attacking players, grab ball_carrier, potential receiver and intended receiver -# ap_features = np.asarray( -# [ -# player_features(p, team=1, potential_receiver=(i != ball_carrier_idx)) -# for i, p in enumerate(attacking_players) -# ] -# ) - -# # loop over defending playres, we don't have ball_carrier, or receivers -# dp_features = np.asarray( -# [ -# player_features(p, team=defending_team_node_value) -# for i, p in enumerate(defending_players) -# ] -# ) - -# # compute ball features -# b_features = ball_features(ball) -# X = np.append(ap_features, dp_features, axis=0) - -# if include_ball_node: -# X = np.append(X, b_features, axis=0) - -# # convert np.NaN to 0 (zero) -# X = np.nan_to_num(X) -# return X diff --git a/unravel/soccer/graphs/graph_converter_pl.py b/unravel/soccer/graphs/graph_converter_pl.py index 252ff03..c1f835f 100644 --- a/unravel/soccer/graphs/graph_converter_pl.py +++ b/unravel/soccer/graphs/graph_converter_pl.py @@ -30,7 +30,7 @@ ) from .graph_settings_pl import GraphSettingsPolars -from .dataset import KloppyPolarsDataset +from .dataset import KloppyPolarsDataset, Column, Group, Constant from .features import ( compute_node_features_pl, compute_adjacency_matrix_pl, @@ -52,17 +52,7 @@ class SoccerGraphConverterPolars(DefaultGraphConverter): Attributes: dataset (TrackingDataset): Kloppy TrackingDataset. - labels (dict): Dict with a key per frame_id, like so {frame_id: True/False/1/0} - graph_id (str, int): Set a single id for the whole Kloppy dataset. - graph_ids (dict): Frame level control over graph ids. - - The graph_ids will be used to assign each graph an identifier. This identifier allows us to split the CustomSpektralDataset such that - all graphs with the same id are either all in the test, train or validation set to avoid leakage. 
It is recommended to either set graph_id (int, str) as - a match_id, or pass a dictionary into 'graph_ids' with exactly the same keys as 'labels' for more granualar control over the graph ids. - The latter can be useful when splitting graphs by possession or sequence id. In this case the dict would be {frame_id: sequence_id/possession_id}. - Note that sequence_id/possession_id should probably be unique for the whole dataset. Perhaps like so {frame_id: 'match_id-sequence_id'}. Defaults to None. - - ball_carrier_threshold (float): The distance threshold to determine the ball carrier. Defaults to 25.0. + chunk_size (int): Determines how many Graphs get processed simultanously. non_potential_receiver_node_value (float): Value between 0 and 1 to assign to the defing team players """ @@ -75,145 +65,169 @@ def __post_init__(self): self.pitch_dimensions: MetricPitchDimensions = self.dataset.pitch_dimensions self.label_col = self.dataset._label_column self.graph_id_col = self.dataset._graph_id_column - - self.ball_carrier_threshold = self.dataset.ball_carrier_threshold + self.dataset = self.dataset.data self._sport_specific_checks() self.settings = self._apply_settings() self.dataset = self._apply_filters() - + if self.pad: self.dataset = self._apply_padding(df=self.dataset) - - @staticmethod - def _apply_padding(df: pl.DataFrame) -> pl.DataFrame: + + def _apply_padding(self, df: pl.DataFrame) -> pl.DataFrame: keep_columns = [ - 'timestamp', - 'ball_state', - 'position_name', - 'label', - 'graph_id' + Column.TIMESTAMP, + Column.BALL_STATE, + Column.POSITION_NAME, + self.label_col, + self.graph_id_col, ] empty_columns = [ - 'id', 'x', 'y', 'z', 'vx', 'vy', - 'vz', 'v', 'ax', 'ay', 'az', 'a' + Column.OBJECT_ID, + Column.IS_BALL_CARRIER, + Column.X, + Column.Y, + Column.Z, + Column.VX, + Column.VY, + Column.VZ, + Column.V, + Column.AX, + Column.AY, + Column.AZ, + Column.A, ] - group_by_columns = ['game_id', 'period_id', 'frame_id', 'team_id', 'ball_owning_team_id'] - - counts 
= ( - df.group_by(group_by_columns) - .agg( - pl.len().alias('count'), - *[pl.first(col).alias(col) for col in keep_columns] - ) + group_by_columns = [ + Column.GAME_ID, + Column.PERIOD_ID, + Column.FRAME_ID, + Column.TEAM_ID, + Column.BALL_OWNING_TEAM_ID, + ] + + counts = df.group_by(group_by_columns).agg( + pl.len().alias("count"), *[pl.first(col).alias(col) for col in keep_columns] ) - - counts = counts.with_columns([ - pl.when(pl.col('team_id') == "ball") - .then(1) - .when(pl.col('team_id') == pl.col('ball_owning_team_id')) - .then(11) - .otherwise(11) - .alias('target_length') - ]) - - groups_to_pad = ( - counts - .filter(pl.col('count') < pl.col('target_length')) - .with_columns( - (pl.col('target_length') - pl.col('count')).alias('repeats') - ) + + counts = counts.with_columns( + [ + pl.when(pl.col(Column.TEAM_ID) == Constant.BALL) + .then(1) + .when(pl.col(Column.TEAM_ID) == pl.col(Column.BALL_OWNING_TEAM_ID)) + .then(11) + .otherwise(11) + .alias("target_length") + ] ) - + + groups_to_pad = counts.filter( + pl.col("count") < pl.col("target_length") + ).with_columns((pl.col("target_length") - pl.col("count")).alias("repeats")) + if len(groups_to_pad) == 0: return df - + padding_rows = [] for row in groups_to_pad.iter_rows(named=True): base_row = {col: row[col] for col in keep_columns + group_by_columns} - padding_rows.extend([base_row] * row['repeats']) - + padding_rows.extend([base_row] * row["repeats"]) + padding_df = pl.DataFrame(padding_rows) - + schema = df.schema - padding_df = padding_df.with_columns([ - pl.lit(0.0 if schema[col] != pl.String else "None").cast(schema[col]).alias(col) - for col in empty_columns - ]) - - padding_df = padding_df.select(df.columns) - - result = pl.concat([df, padding_df], how='vertical') - - total_frames = ( - result.select(['game_id', 'period_id', 'frame_id']) - .unique() - .height + padding_df = padding_df.with_columns( + [ + pl.lit(0.0 if schema[col] != pl.String else "None") + .cast(schema[col]) + .alias(col) + for 
col in empty_columns + ] ) - + + padding_df = padding_df.select(df.columns) + + result = pl.concat([df, padding_df], how="vertical") + + total_frames = result.select(Group.BY_FRAME).unique().height + frame_completeness = ( - result.group_by(['game_id', 'period_id', 'frame_id']) - .agg([ - (pl.col('team_id').eq("ball").sum() == 1).alias('has_ball'), - (pl.col('team_id').eq(pl.col('ball_owning_team_id')).sum() == 11).alias('has_owning_team'), - ((~pl.col('team_id').eq("ball") & ~pl.col('team_id').eq(pl.col('ball_owning_team_id'))).sum() == 11).alias('has_other_team') - ]) + result.group_by(Group.BY_FRAME) + .agg( + [ + (pl.col(Column.TEAM_ID).eq(Constant.BALL).sum() == 1).alias( + "has_ball" + ), + ( + pl.col(Column.TEAM_ID) + .eq(pl.col(Column.BALL_OWNING_TEAM_ID)) + .sum() + == 11 + ).alias("has_owning_team"), + ( + ( + ~pl.col(Column.TEAM_ID).eq(Constant.BALL) + & ~pl.col(Column.TEAM_ID).eq( + pl.col(Column.BALL_OWNING_TEAM_ID) + ) + ).sum() + == 11 + ).alias("has_other_team"), + ] + ) .filter( - pl.col('has_ball') & pl.col('has_owning_team') & pl.col('has_other_team') + pl.col("has_ball") + & pl.col("has_owning_team") + & pl.col("has_other_team") ) ) - + complete_frames = frame_completeness.height - + dropped_frames = total_frames - complete_frames if dropped_frames > 0: import warnings + warnings.warn( f"""Setting pad=True drops frames that do not have at least 1 object for the attacking team, defending team or ball. 
This operation dropped {dropped_frames} incomplete frames out of {total_frames} total frames ({(dropped_frames/total_frames)*100:.2f}%) """ ) - - return result.join( - frame_completeness, - on=['game_id', 'period_id', 'frame_id'], - how='inner' - ) + + return result.join(frame_completeness, on=Group.BY_FRAME, how="inner") def _apply_filters(self): return self.dataset.with_columns( pl.when( - (pl.col(self.settings._identifier_column) == self.settings.ball_id) - & (pl.col("v") > self.settings.max_ball_speed) + (pl.col(Column.OBJECT_ID) == Constant.BALL) + & (pl.col(Column.V) > self.settings.max_ball_speed) ) .then(self.settings.max_ball_speed) .when( - (pl.col(self.settings._identifier_column) != self.settings.ball_id) - & (pl.col("v") > self.settings.max_player_speed) + (pl.col(Column.OBJECT_ID) != Constant.BALL) + & (pl.col(Column.V) > self.settings.max_player_speed) ) .then(self.settings.max_player_speed) - .otherwise(pl.col("v")) - .alias("v") + .otherwise(pl.col(Column.V)) + .alias(Column.V) ).with_columns( pl.when( - (pl.col(self.settings._identifier_column) == self.settings.ball_id) - & (pl.col("a") > self.settings.max_ball_acceleration) + (pl.col(Column.OBJECT_ID) == Constant.BALL) + & (pl.col(Column.A) > self.settings.max_ball_acceleration) ) .then(self.settings.max_ball_acceleration) .when( - (pl.col(self.settings._identifier_column) != self.settings.ball_id) - & (pl.col("a") > self.settings.max_player_acceleration) + (pl.col(Column.OBJECT_ID) != Constant.BALL) + & (pl.col(Column.A) > self.settings.max_player_acceleration) ) .then(self.settings.max_player_acceleration) - .otherwise(pl.col("a")) - .alias("a") + .otherwise(pl.col(Column.A)) + .alias(Column.A) ) def _apply_settings(self): return GraphSettingsPolars( pitch_dimensions=self.pitch_dimensions, - ball_carrier_treshold=self.ball_carrier_threshold, max_player_speed=self.max_player_speed, max_ball_speed=self.max_ball_speed, max_player_acceleration=self.max_player_acceleration, @@ -249,73 +263,83 @@ def 
_sport_specific_checks(self): "Please specify a 'graph_id_col' and add that column to your 'dataset' ..." ) - if self.ball_carrier_threshold and not isinstance( - self.ball_carrier_threshold, float - ): - raise Exception("'ball_carrier_threshold' should be of type float") - if self.non_potential_receiver_node_value and not isinstance( self.non_potential_receiver_node_value, float ): raise Exception( "'non_potential_receiver_node_value' should be of type float" ) - + @property def __exprs_variables(self): return [ - "x", "y", "z", - "v", "vx", "vy", "vz", - "a", "ax", "ay", "az", - "team_id", "position_name", "ball_owning_team_id", + Column.X, + Column.Y, + Column.Z, + Column.V, + Column.VX, + Column.VY, + Column.VZ, + Column.A, + Column.AX, + Column.AY, + Column.AZ, + Column.TEAM_ID, + Column.POSITION_NAME, + Column.BALL_OWNING_TEAM_ID, + Column.IS_BALL_CARRIER, self.graph_id_col, self.label_col, ] - + def __compute(self, args: List[pl.Series]) -> dict: d = {col: args[i].to_numpy() for i, col in enumerate(self.__exprs_variables)} - + if not np.all(d[self.graph_id_col] == d[self.graph_id_col][0]): raise Exception( "GraphId selection contains multiple different values. Make sure each graph_id is unique by at least game_id and frame_id..." ) - if not self.prediction and not np.all(d[self.label_col] == d[self.label_col][0]): + if not self.prediction and not np.all( + d[self.label_col] == d[self.label_col][0] + ): raise Exception( """Label selection contains multiple different values for a single selection (group by) of game_id and frame_id, make sure this is not the case. 
Each group can only have 1 label.""" ) - - ball_carrier_idx = get_ball_carrier_idx( - x=d['x'], y=d['y'], z=d['z'], - team=d['team_id'], - possession_team=d['ball_owning_team_id'], - ball_id=self.settings.ball_id, - threshold=self.settings.ball_carrier_treshold, - ) + ball_carriers = np.where(d[Column.IS_BALL_CARRIER] == True)[0] + if len(ball_carriers) == 0: + ball_carrier_idx = None + else: + ball_carrier_idx = ball_carriers[0] + adjacency_matrix = compute_adjacency_matrix_pl( - team=d['team_id'], - possession_team=d['ball_owning_team_id'], + team=d[Column.TEAM_ID], + ball_owning_team=d[Column.BALL_OWNING_TEAM_ID], settings=self.settings, ball_carrier_idx=ball_carrier_idx, ) + + velocity = np.stack((d[Column.VX], d[Column.VY]), axis=-1) edge_features = compute_edge_features_pl( adjacency_matrix=adjacency_matrix, - p3d=np.stack((d['x'], d['y'], d['z']), axis=-1), - p2d=np.stack((d['x'], d['y']), axis=-1), - s=d['v'], - velocity=np.stack((d['vx'], d['vy']), axis=-1), - team=d['team_id'], + p3d=np.stack((d[Column.X], d[Column.Y], d[Column.Z]), axis=-1), + p2d=np.stack((d[Column.X], d[Column.Y]), axis=-1), + s=d[Column.V], + velocity=velocity, + team=d[Column.TEAM_ID], settings=self.settings, ) + node_features = compute_node_features_pl( - d['x'], - d['y'], - s=d['v'], - velocity=np.stack((d['vx'], d['vy']), axis=-1), - team=d['team_id'], - possession_team=d['ball_owning_team_id'], - is_gk=(d['position_name'] == self.settings.goalkeeper_id).astype(int), + d[Column.X], + d[Column.Y], + s=d[Column.V], + velocity=velocity, + team=d[Column.TEAM_ID], + possession_team=d[Column.BALL_OWNING_TEAM_ID], + is_gk=(d[Column.POSITION_NAME] == self.settings.goalkeeper_id).astype(int), + ball_carrier=d[Column.IS_BALL_CARRIER], settings=self.settings, ) return { @@ -337,11 +361,9 @@ def __compute(self, args: List[pl.Series]) -> dict: self.graph_id_col: d[self.graph_id_col][0], self.label_col: d[self.label_col][0], } - + def _convert(self): - result_df = self.dataset.group_by( - 
["game_id", "frame_id"], maintain_order=True - ).agg( + result_df = self.dataset.group_by(Group.BY_FRAME, maintain_order=True).agg( pl.map_groups( exprs=self.__exprs_variables, function=self.__compute, @@ -369,8 +391,6 @@ def _convert(self): ) return graph_df.drop("result_dict") - - def to_graph_frames(self) -> List[dict]: def __convert_to_graph_data_list(df): @@ -406,10 +426,10 @@ def __convert_to_graph_data_list(df): graph_list.extend(chunk_graph_list) return graph_list - + graph_df = self._convert() - self.graph_frames = self.__convert_to_graph_data_list(graph_df) - + self.graph_frames = __convert_to_graph_data_list(graph_df) + return self.graph_frames def to_spektral_graphs(self) -> List[Graph]: diff --git a/unravel/soccer/graphs/graph_settings_pl.py b/unravel/soccer/graphs/graph_settings_pl.py index 4e934a9..a7713f4 100644 --- a/unravel/soccer/graphs/graph_settings_pl.py +++ b/unravel/soccer/graphs/graph_settings_pl.py @@ -6,10 +6,12 @@ from kloppy.domain import Dimension, Unit, MetricPitchDimensions from typing import Optional +from .dataset import Constant + @dataclass class GraphSettingsPolars(DefaultGraphSettings): - ball_id: str = "ball" + ball_id: str = Constant.BALL goalkeeper_id: str = "GK" boundary_correction: float = None non_potential_receiver_node_value: float = 0.1 @@ -17,7 +19,6 @@ class GraphSettingsPolars(DefaultGraphSettings): pitch_dimensions: MetricPitchDimensions = field( init=False, repr=False, default_factory=MetricPitchDimensions ) - _identifier_column: str = field(default="id", init=False) def __post_init__(self): self._sport_specific_checks() diff --git a/unravel/utils/features/utils.py b/unravel/utils/features/utils.py index 282fe61..c11e8f3 100644 --- a/unravel/utils/features/utils.py +++ b/unravel/utils/features/utils.py @@ -204,7 +204,7 @@ def distance_to_ball( def get_ball_carrier_idx(x, y, z, team, possession_team, ball_id, threshold): _, _, dist_to_ball = distance_to_ball(x=x, y=y, z=z, team=team, ball_id=ball_id) - + 
print(dist_to_ball) filtered_distances = np.where( (team != possession_team) | (dist_to_ball <= threshold), np.inf, dist_to_ball ) From bd2a63687ba29e01c7df6bb1e91b415adddf244d Mon Sep 17 00:00:00 2001 From: "UnravelSports [JB]" Date: Sun, 26 Jan 2025 12:13:32 +0100 Subject: [PATCH 04/10] Qhull error --- unravel/soccer/graphs/graph_converter.py | 23 ++- unravel/soccer/graphs/graph_converter_pl.py | 204 ++++++++++++++------ 2 files changed, 155 insertions(+), 72 deletions(-) diff --git a/unravel/soccer/graphs/graph_converter.py b/unravel/soccer/graphs/graph_converter.py index 4eded6e..31c093b 100644 --- a/unravel/soccer/graphs/graph_converter.py +++ b/unravel/soccer/graphs/graph_converter.py @@ -2,6 +2,8 @@ import sys from copy import deepcopy +from scipy.spatial.qhull import QhullError + import warnings from dataclasses import dataclass, field, asdict @@ -238,15 +240,18 @@ def to_graph_frames(self) -> dict: for frame in tqdm(self.dataset, desc="Processing frames"): data, label, frame_id, graph_id = self._convert(frame) if data.home_players and data.away_players: - gnn_frame = GraphFrame( - frame_id=frame_id, - data=data, - label=label, - graph_id=graph_id, - settings=self.settings, - ) - if gnn_frame.graph_data: - self.graph_frames.append(gnn_frame) + try: + gnn_frame = GraphFrame( + frame_id=frame_id, + data=data, + label=label, + graph_id=graph_id, + settings=self.settings, + ) + if gnn_frame.graph_data: + self.graph_frames.append(gnn_frame) + except QhullError: + pass return self.graph_frames diff --git a/unravel/soccer/graphs/graph_converter_pl.py b/unravel/soccer/graphs/graph_converter_pl.py index c1f835f..077e296 100644 --- a/unravel/soccer/graphs/graph_converter_pl.py +++ b/unravel/soccer/graphs/graph_converter_pl.py @@ -73,9 +73,23 @@ def __post_init__(self): self.dataset = self._apply_filters() if self.pad: - self.dataset = self._apply_padding(df=self.dataset) + self.dataset = self._apply_padding() + + self._shuffle() + + def _shuffle(self): + if 
isinstance(self.settings.random_seed, int): + self.dataset = self.dataset.sample( + fraction=1.0, seed=self.settings.random_seed + ) + elif self.settings.random_seed == True: + self.dataset = self.dataset.sample(fraction=1.0) + else: + pass + + def _apply_padding(self) -> pl.DataFrame: + df = self.dataset - def _apply_padding(self, df: pl.DataFrame) -> pl.DataFrame: keep_columns = [ Column.TIMESTAMP, Column.BALL_STATE, @@ -342,6 +356,7 @@ def __compute(self, args: List[pl.Series]) -> dict: ball_carrier=d[Column.IS_BALL_CARRIER], settings=self.settings, ) + return { "e": pl.Series( [edge_features.tolist()], dtype=pl.List(pl.List(pl.Float64)) @@ -362,76 +377,139 @@ def __compute(self, args: List[pl.Series]) -> dict: self.label_col: d[self.label_col][0], } + # def _convert(self): + # result_df = self.dataset.group_by(Group.BY_FRAME, maintain_order=True).agg( + # pl.map_groups( + # exprs=self.__exprs_variables, + # function=self.__compute, + # ).alias("result_dict") + # ) + + # graph_df = result_df.with_columns( + # [ + # pl.col("result_dict").struct.field("a").alias("a"), + # pl.col("result_dict").struct.field("e").alias("e"), + # pl.col("result_dict").struct.field("x").alias("x"), + # pl.col("result_dict").struct.field("e_shape_0").alias("e_shape_0"), + # pl.col("result_dict").struct.field("e_shape_1").alias("e_shape_1"), + # pl.col("result_dict").struct.field("x_shape_0").alias("x_shape_0"), + # pl.col("result_dict").struct.field("x_shape_1").alias("x_shape_1"), + # pl.col("result_dict").struct.field("a_shape_0").alias("a_shape_0"), + # pl.col("result_dict").struct.field("a_shape_1").alias("a_shape_1"), + # pl.col("result_dict") + # .struct.field(self.graph_id_col) + # .alias(self.graph_id_col), + # pl.col("result_dict") + # .struct.field(self.label_col) + # .alias(self.label_col), + # ] + # ) + + # return graph_df.drop("result_dict") + + # def to_graph_frames(self) -> List[dict]: + # def __convert_to_graph_data_list(df): + # lazy_df = df.lazy() + + # graph_list = 
[] + + # for chunk in lazy_df.collect().iter_slices(self.chunk_size): + # chunk_graph_list = [ + # { + # "a": make_sparse( + # flatten_to_reshaped_array( + # arr=chunk["a"][i], + # s0=chunk["a_shape_0"][i], + # s1=chunk["a_shape_1"][i], + # ) + # ), + # "x": flatten_to_reshaped_array( + # arr=chunk["x"][i], + # s0=chunk["x_shape_0"][i], + # s1=chunk["x_shape_1"][i], + # ), + # "e": flatten_to_reshaped_array( + # arr=chunk["e"][i], + # s0=chunk["e_shape_0"][i], + # s1=chunk["e_shape_1"][i], + # ), + # "y": np.asarray([chunk[self.label_col][i]]), + # "id": chunk[self.graph_id_col][i], + # } + # for i in range(len(chunk["a"])) + # ] + # graph_list.extend(chunk_graph_list) + + # return graph_list + + # graph_df = self._convert() + # self.graph_frames = __convert_to_graph_data_list(graph_df) + + # return self.graph_frames + + ### def _convert(self): - result_df = self.dataset.group_by(Group.BY_FRAME, maintain_order=True).agg( - pl.map_groups( - exprs=self.__exprs_variables, - function=self.__compute, - ).alias("result_dict") - ) - - graph_df = result_df.with_columns( - [ - pl.col("result_dict").struct.field("a").alias("a"), - pl.col("result_dict").struct.field("e").alias("e"), - pl.col("result_dict").struct.field("x").alias("x"), - pl.col("result_dict").struct.field("e_shape_0").alias("e_shape_0"), - pl.col("result_dict").struct.field("e_shape_1").alias("e_shape_1"), - pl.col("result_dict").struct.field("x_shape_0").alias("x_shape_0"), - pl.col("result_dict").struct.field("x_shape_1").alias("x_shape_1"), - pl.col("result_dict").struct.field("a_shape_0").alias("a_shape_0"), - pl.col("result_dict").struct.field("a_shape_1").alias("a_shape_1"), - pl.col("result_dict") - .struct.field(self.graph_id_col) - .alias(self.graph_id_col), - pl.col("result_dict") - .struct.field(self.label_col) - .alias(self.label_col), - ] + # Group and aggregate in one step + return ( + self.dataset.group_by(Group.BY_FRAME, maintain_order=True) + .agg( + pl.map_groups( + 
exprs=self.__exprs_variables, function=self.__compute + ).alias("result_dict") + ) + .with_columns( + [ + *[ + pl.col("result_dict").struct.field(f).alias(f) + for f in ["a", "e", "x", self.graph_id_col, self.label_col] + ], + *[ + pl.col("result_dict") + .struct.field(f"{m}_shape_{i}") + .alias(f"{m}_shape_{i}") + for m in ["a", "e", "x"] + for i in [0, 1] + ], + ] + ) + .drop("result_dict") ) - return graph_df.drop("result_dict") + @staticmethod + def _reshape_array(arr, s0, s1): + return np.array([item for sublist in arr for item in sublist]).reshape(s0, s1) def to_graph_frames(self) -> List[dict]: - def __convert_to_graph_data_list(df): - lazy_df = df.lazy() - - graph_list = [] - - for chunk in lazy_df.collect().iter_slices(self.chunk_size): - chunk_graph_list = [ - { - "a": make_sparse( - flatten_to_reshaped_array( - arr=chunk["a"][i], - s0=chunk["a_shape_0"][i], - s1=chunk["a_shape_1"][i], - ) - ), - "x": flatten_to_reshaped_array( - arr=chunk["x"][i], - s0=chunk["x_shape_0"][i], - s1=chunk["x_shape_1"][i], - ), - "e": flatten_to_reshaped_array( - arr=chunk["e"][i], - s0=chunk["e_shape_0"][i], - s1=chunk["e_shape_1"][i], - ), - "y": np.asarray([chunk[self.label_col][i]]), - "id": chunk[self.graph_id_col][i], - } - for i in range(len(chunk["a"])) - ] - graph_list.extend(chunk_graph_list) - - return graph_list + def process_chunk(chunk: pl.DataFrame) -> List[dict]: + return [ + { + "a": make_sparse( + self._reshape_array( + chunk["a"][i], chunk["a_shape_0"][i], chunk["a_shape_1"][i] + ) + ), + "x": self._reshape_array( + chunk["x"][i], chunk["x_shape_0"][i], chunk["x_shape_1"][i] + ), + "e": self._reshape_array( + chunk["e"][i], chunk["e_shape_0"][i], chunk["e_shape_1"][i] + ), + "y": np.asarray([chunk[self.label_col][i]]), + "id": chunk[self.graph_id_col][i], + } + for i in range(len(chunk)) + ] graph_df = self._convert() - self.graph_frames = __convert_to_graph_data_list(graph_df) - + self.graph_frames = [ + graph + for chunk in 
graph_df.lazy().collect().iter_slices(self.chunk_size) + for graph in process_chunk(chunk) + ] return self.graph_frames + ### + def to_spektral_graphs(self) -> List[Graph]: if not self.graph_frames: self.to_graph_frames() From 310816d9fd3e050bb1765db805e5c24ce4140101 Mon Sep 17 00:00:00 2001 From: "UnravelSports [JB]" Date: Sun, 26 Jan 2025 13:25:27 +0100 Subject: [PATCH 05/10] reworked NFL --- tests/test_bigdb.py | 17 +- tests/test_kloppy_polars.py | 27 -- tests/test_spektral.py | 8 +- unravel/american_football/graphs/dataset.py | 152 ++++++---- .../graphs/features/adjacency_matrix.py | 3 +- .../graphs/features/edge_features.py | 3 + .../graphs/features/node_features.py | 12 +- .../graphs/graph_converter.py | 287 ++++++++---------- .../graphs/graph_settings.py | 2 - unravel/soccer/graphs/dataset.py | 28 +- .../graphs/features/edge_features_pl.py | 3 + .../graphs/features/node_features_pl.py | 5 +- unravel/soccer/graphs/graph_converter_pl.py | 132 ++------ unravel/utils/features/utils.py | 16 +- unravel/utils/objects/default_dataset.py | 5 +- .../utils/objects/default_graph_converter.py | 3 + 16 files changed, 309 insertions(+), 394 deletions(-) diff --git a/tests/test_bigdb.py b/tests/test_bigdb.py index adedd3e..eb2caca 100644 --- a/tests/test_bigdb.py +++ b/tests/test_bigdb.py @@ -20,9 +20,8 @@ AmericanFootballGraphConverter, AmericanFootballPitchDimensions, ) +from unravel.american_football.graphs.dataset import Constant from unravel.utils import ( - add_graph_id_column, - add_dummy_label_column, flatten_to_reshaped_array, make_sparse, CustomSpektralDataset, @@ -53,10 +52,8 @@ def dataset(self, coordinates: str, players: str, plays: str): plays_file_path=plays, ) bdb_dataset.load() - bdb_dataset.add_graph_ids(by=["gameId", "playId"], column_name="graph_id") - bdb_dataset.add_dummy_labels( - by=["gameId", "playId", "frameId"], column_name="label" - ) + bdb_dataset.add_graph_ids(by=["gameId", "playId"]) + bdb_dataset.add_dummy_labels(by=["gameId", "playId", 
"frameId"]) return bdb_dataset @pytest.fixture @@ -141,8 +138,6 @@ def node_feature_values(self): @pytest.fixture def arguments(self): return dict( - label_col="label", - graph_id_col="graph_id", max_player_speed=8.0, max_ball_speed=28.0, max_player_acceleration=10.0, @@ -161,8 +156,6 @@ def arguments(self): @pytest.fixture def non_default_arguments(self): return dict( - label_col="label", - graph_id_col="graph_id", max_player_speed=12.0, max_ball_speed=24.0, max_player_acceleration=11.0, @@ -199,8 +192,8 @@ def test_settings(self, gnnc_non_default, non_default_arguments): assert settings.pitch_dimensions.y_dim.min == -26.65 assert settings.pitch_dimensions.end_zone == 50.0 - assert settings.ball_id == "football" - assert settings.qb_id == "QB" + assert Constant.BALL == "football" + assert Constant.QB == "QB" assert settings.max_height == 225.0 assert settings.min_height == 150.0 assert settings.max_weight == 200.0 diff --git a/tests/test_kloppy_polars.py b/tests/test_kloppy_polars.py index 4d70d2a..df31b1a 100644 --- a/tests/test_kloppy_polars.py +++ b/tests/test_kloppy_polars.py @@ -75,11 +75,6 @@ def spc_padding( def soccer_polars_converter( self, kloppy_polars_dataset: KloppyPolarsDataset ) -> SoccerGraphConverterPolars: - # TODO: - # check if - # - random_seed - # - padding needs to be per team_id otherwise stuff breaks - # all work as expected and/or should be moved to the KloppyPolarsDataset return SoccerGraphConverterPolars( dataset=kloppy_polars_dataset, @@ -99,28 +94,6 @@ def soccer_polars_converter( verbose=False, ) - # @pytest.fixture() - # def gnnc_padding_random(self, dataset: TrackingDataset) -> SoccerGraphConverter: - # return SoccerGraphConverter( - # dataset=dataset, - # labels=dummy_labels(dataset), - # # settings - # ball_carrier_treshold=25.0, - # max_player_speed=12.0, - # max_ball_speed=28.0, - # boundary_correction=None, - # self_loop_ball=False, - # adjacency_matrix_connect_type="ball", - # adjacency_matrix_type="split_by_team", - # 
label_type="binary", - # defending_team_node_value=0.0, - # non_potential_receiver_node_value=0.1, - # infer_ball_ownership=True, - # infer_goalkeepers=True, - # random_seed=42, - # pad=True, - # verbose=False, - # ) def test_padding(self, spc_padding: SoccerGraphConverterPolars): spektral_graphs = spc_padding.to_spektral_graphs() diff --git a/tests/test_spektral.py b/tests/test_spektral.py index b170970..6e14ae4 100644 --- a/tests/test_spektral.py +++ b/tests/test_spektral.py @@ -45,10 +45,8 @@ def bdb_dataset(self, coordinates: str, players: str, plays: str): plays_file_path=plays, ) bdb_dataset.load() - bdb_dataset.add_graph_ids(by=["gameId", "playId"], column_name="graph_id") - bdb_dataset.add_dummy_labels( - by=["gameId", "playId", "frameId"], column_name="label" - ) + bdb_dataset.add_graph_ids(by=["gameId", "playId"]) + bdb_dataset.add_dummy_labels(by=["gameId", "playId", "frameId"]) return bdb_dataset @pytest.fixture @@ -122,8 +120,6 @@ def bdb_converter( ) -> AmericanFootballGraphConverter: return AmericanFootballGraphConverter( dataset=bdb_dataset, - label_col="label", - graph_id_col="graph_id", max_player_speed=8.0, max_ball_speed=28.0, max_player_acceleration=10.0, diff --git a/unravel/american_football/graphs/dataset.py b/unravel/american_football/graphs/dataset.py index fdb7310..4b8ccff 100644 --- a/unravel/american_football/graphs/dataset.py +++ b/unravel/american_football/graphs/dataset.py @@ -10,22 +10,52 @@ from ...utils import DefaultDataset, add_dummy_label_column, add_graph_id_column +class Constant: + BALL = "football" + QB = "QB" + + +class Column: + OBJECT_ID = "nflId" + + GAME_ID = "gameId" + FRAME_ID = "frameId" + PLAY_ID = "playId" + + X = "x" + Y = "y" + + ACCELERATION = "a" + SPEED = "s" + ORIENTATION = "o" + DIRECTION = "dir" + TEAM = "team" + CLUB = "club" + OFFICIAL_POSITION = "officialPosition" + POSSESSION_TEAM = "possessionTeam" + HEIGHT_CM = "height_cm" + WEIGHT_KG = "weight_kg" + + +class Group: + BY_FRAME = [Column.GAME_ID, 
Column.PLAY_ID, Column.FRAME_ID] + BY_PLAY_POSSESSION_TEAM = [Column.GAME_ID, Column.PLAY_ID, Column.POSSESSION_TEAM] + + @dataclass class BigDataBowlDataset(DefaultDataset): - tracking_file_path: str - players_file_path: str - plays_file_path: str - pitch_dimensions: AmericanFootballPitchDimensions = field( - init=False, repr=False, default_factory=AmericanFootballPitchDimensions - ) - - def __post_init__(self): - if ( - not self.tracking_file_path - or not self.players_file_path - or not self.plays_file_path - ): - raise Exception("Missing data file path...") + def __init__( + self, + tracking_file_path: str, + players_file_path: str, + plays_file_path: str, + **kwargs, + ): + super().__init__(**kwargs) + self.tracking_file_path = tracking_file_path + self.players_file_path = players_file_path + self.plays_file_path = plays_file_path + self.pitch_dimensions = AmericanFootballPitchDimensions() def load(self): pitch_length = self.pitch_dimensions.pitch_length @@ -42,48 +72,51 @@ def load(self): play_direction = "left" if "club" in df.columns: - df = df.with_columns(pl.col("club").alias("team")) - df = df.drop("club") + df = df.with_columns(pl.col(Column.CLUB).alias(Column.TEAM)) + df = df.drop(Column.CLUB) df = ( df.with_columns( pl.when(pl.col("playDirection") == play_direction) - .then(pl.col("o") + 180) # rotate 180 degrees - .otherwise(pl.col("o")) - .alias("o"), + .then(pl.col(Column.ORIENTATION) + 180) # rotate 180 degrees + .otherwise(pl.col(Column.ORIENTATION)) + .alias(Column.ORIENTATION), pl.when(pl.col("playDirection") == play_direction) - .then(pl.col("dir") + 180) # rotate 180 degrees - .otherwise(pl.col("dir")) - .alias("dir"), + .then(pl.col(Column.DIRECTION) + 180) # rotate 180 degrees + .otherwise(pl.col(Column.DIRECTION)) + .alias(Column.DIRECTION), ) .with_columns( [ - (pl.col("x") - (pitch_length / 2)).alias("x"), - (pl.col("y") - (pitch_width / 2)).alias("y"), + (pl.col(Column.X) - (pitch_length / 2)).alias(Column.X), + (pl.col(Column.Y) - 
(pitch_width / 2)).alias(Column.Y), # convert to radian on (-pi, pi) range - (((pl.col("o") * np.pi / 180) + np.pi) % (2 * np.pi) - np.pi).alias( - "o" - ), ( - ((pl.col("dir") * np.pi / 180) + np.pi) % (2 * np.pi) - np.pi - ).alias("dir"), + ((pl.col(Column.ORIENTATION) * np.pi / 180) + np.pi) + % (2 * np.pi) + - np.pi + ).alias(Column.ORIENTATION), + ( + ((pl.col(Column.DIRECTION) * np.pi / 180) + np.pi) % (2 * np.pi) + - np.pi + ).alias(Column.DIRECTION), ] ) .with_columns( [ pl.when(pl.col("playDirection") == play_direction) - .then(pl.col("x") * -1.0) - .otherwise(pl.col("x")) - .alias("x"), + .then(pl.col(Column.X) * -1.0) + .otherwise(pl.col(Column.X)) + .alias(Column.X), pl.when(pl.col("playDirection") == play_direction) - .then(pl.col("y") * -1.0) - .otherwise(pl.col("y")) - .alias("y"), + .then(pl.col(Column.Y) * -1.0) + .otherwise(pl.col(Column.Y)) + .alias(Column.Y), # set "football" to nflId -9999 for ordering purposes - pl.when(pl.col("team") == "football") + pl.when(pl.col(Column.TEAM) == Constant.BALL) .then(-9999.9) - .otherwise(pl.col("nflId")) - .alias("nflId"), + .otherwise(pl.col(Column.OBJECT_ID)) + .alias(Column.OBJECT_ID), ] ) ) @@ -96,11 +129,15 @@ def load(self): ignore_errors=True, ) if "position" in players.columns: - players = players.with_columns(pl.col("position").alias("officialPosition")) + players = players.with_columns( + pl.col("position").alias(Column.OFFICIAL_POSITION) + ) players = players.drop("position") players = players.with_columns( - pl.col("nflId").cast(pl.Float64, strict=False).alias("nflId") + pl.col(Column.OBJECT_ID) + .cast(pl.Float64, strict=False) + .alias(Column.OBJECT_ID) ) players = self._convert_weight_height_to_metric(df=players) @@ -113,13 +150,22 @@ def load(self): ) df = df.join( - (players.select(["nflId", "officialPosition", "height_cm", "weight_kg"])), - on="nflId", + ( + players.select( + [ + Column.OBJECT_ID, + Column.OFFICIAL_POSITION, + Column.HEIGHT_CM, + Column.WEIGHT_KG, + ] + ) + ), + 
on=Column.OBJECT_ID, how="left", ) df = df.join( - (plays.select(["gameId", "playId", "possessionTeam"])), - on=["gameId", "playId"], + (plays.select(Group.BY_PLAY_POSSESSION_TEAM)), + on=[Column.GAME_ID, Column.PLAY_ID], how="left", ) self.data = df @@ -137,17 +183,13 @@ def load(self): return self.data, self.pitch_dimensions def add_dummy_labels( - self, - by: List[str] = ["gameId", "playId", "frameId"], - column_name: str = "label", + self, by: List[str] = ["gameId", "playId", "frameId"] ) -> pl.DataFrame: - self.data = add_dummy_label_column(self.data, by, column_name) + self.data = add_dummy_label_column(self.data, by, self._label_column) return self.data - def add_graph_ids( - self, by: List[str] = ["gameId", "playId"], column_name: str = "graph_id" - ) -> pl.DataFrame: - self.data = add_graph_id_column(self.data, by, column_name) + def add_graph_ids(self, by: List[str] = ["gameId", "playId"]) -> pl.DataFrame: + self.data = add_graph_id_column(self.data, by, self._graph_id_column) return self.data @staticmethod @@ -166,9 +208,11 @@ def _convert_weight_height_to_metric(df: pl.DataFrame): ) df = df.with_columns( [ - (pl.col("feet") * 30.48 + pl.col("inches") * 2.54).alias("height_cm"), + (pl.col("feet") * 30.48 + pl.col("inches") * 2.54).alias( + Column.HEIGHT_CM + ), (pl.col("weight") * 0.453592).alias( - "weight_kg" + Column.WEIGHT_KG ), # Convert pounds to kilograms ] ).drop(["height", "feet", "inches", "weight"]) diff --git a/unravel/american_football/graphs/features/adjacency_matrix.py b/unravel/american_football/graphs/features/adjacency_matrix.py index 130cd0f..4272871 100644 --- a/unravel/american_football/graphs/features/adjacency_matrix.py +++ b/unravel/american_football/graphs/features/adjacency_matrix.py @@ -1,12 +1,13 @@ import numpy as np from ....utils import AdjacencyMatrixType, AdjacenyMatrixConnectType +from ..dataset import Constant def compute_adjacency_matrix(team, possession_team, settings): adjacency_matrix_type = 
settings.adjacency_matrix_type adjacency_matrix_connect_type = settings.adjacency_matrix_connect_type - ball_id = settings.ball_id + ball_id = Constant.BALL exclusion_ids = np.asarray([ball_id, *np.unique(possession_team)]) defensive_team = np.setdiff1d(team, exclusion_ids)[0] diff --git a/unravel/american_football/graphs/features/edge_features.py b/unravel/american_football/graphs/features/edge_features.py index 7ff3081..78f491c 100644 --- a/unravel/american_football/graphs/features/edge_features.py +++ b/unravel/american_football/graphs/features/edge_features.py @@ -8,6 +8,7 @@ normalize_speed_differences_nfl, normalize_accelerations_nfl, ) +from ..dataset import Constant def compute_edge_features(adjacency_matrix, p, s, a, o, dir, team, settings): @@ -26,12 +27,14 @@ def compute_edge_features(adjacency_matrix, p, s, a, o, dir, team, settings): speed_diff_matrix_normed = normalize_speed_differences_nfl( s=speed_diff_matrix, team=team, + ball_id=Constant.BALL, settings=settings, ) acc_diff_matrix = np.nan_to_num(a[None, :] - a[:, None]) # NxNx1 acc_diff_matrix_normed = normalize_accelerations_nfl( a=acc_diff_matrix, team=team, + ball_id=Constant.BALL, settings=settings, ) vect_to_player_matrix = p[:, None, :] - p[None, :, :] # NxNx2 diff --git a/unravel/american_football/graphs/features/node_features.py b/unravel/american_football/graphs/features/node_features.py index c723e21..dbf74f2 100644 --- a/unravel/american_football/graphs/features/node_features.py +++ b/unravel/american_football/graphs/features/node_features.py @@ -12,6 +12,8 @@ normalize_between, ) +from ..dataset import Constant + def compute_node_features( x, @@ -27,7 +29,7 @@ def compute_node_features( weight, settings, ): - ball_id = settings.ball_id + ball_id = Constant.BALL goal_mouth_position = ( settings.pitch_dimensions.x_dim.max, @@ -61,10 +63,12 @@ def compute_node_features( min_value=settings.pitch_dimensions.y_dim.min, ) uv_sa = unit_vector_from_angle(value=s, angle_radians=dir) - s_normed = 
normalize_speeds_nfl(s, team, settings) + s_normed = normalize_speeds_nfl(s, team, ball_id=Constant.BALL, settings=settings) uv_aa = unit_vector_from_angle(value=a, angle_radians=dir) - a_normed = normalize_accelerations_nfl(a, team, settings) + a_normed = normalize_accelerations_nfl( + a, team, ball_id=Constant.BALL, settings=settings + ) dir_sin_normed = normalize_sincos(np.nan_to_num(np.sin(dir))) dir_cos_normed = normalize_sincos(np.nan_to_num(np.cos(dir))) @@ -92,7 +96,7 @@ def compute_node_features( team == possession_team, 1, settings.defending_team_node_value ) is_qb = np.where( - official_position == settings.qb_id, # First condition + official_position == Constant.QB, # First condition 1, # If true, set to 1 (indicating the player is a QB) np.where( team == possession_team, # Second condition inside the else of the first diff --git a/unravel/american_football/graphs/graph_converter.py b/unravel/american_football/graphs/graph_converter.py index 172164d..07b01e1 100644 --- a/unravel/american_football/graphs/graph_converter.py +++ b/unravel/american_football/graphs/graph_converter.py @@ -7,7 +7,7 @@ from spektral.data import Graph -from .dataset import BigDataBowlDataset +from .dataset import BigDataBowlDataset, Group, Column, Constant from .graph_settings import ( AmericanFootballGraphSettings, @@ -19,7 +19,7 @@ compute_adjacency_matrix, ) -from ...utils import DefaultGraphConverter, flatten_to_reshaped_array, make_sparse +from ...utils import DefaultGraphConverter, reshape_array, make_sparse @dataclass(repr=True) @@ -39,8 +39,6 @@ class AmericanFootballGraphConverter(DefaultGraphConverter): def __init__( self, dataset: BigDataBowlDataset, - label_col: str = "label", - graph_id_col: str = "graph_id", chunk_size: int = 2_000, attacking_non_qb_node_value: float = 0.1, **kwargs, @@ -50,12 +48,13 @@ def __init__( if not isinstance(dataset, BigDataBowlDataset): raise Exception("'dataset' should be an instance of BigDataBowlDataset") + self.label_col = 
dataset._label_column + self.graph_id_col = dataset._graph_id_column + self.dataset: pl.DataFrame = dataset.data self.pitch_dimensions: AmericanFootballPitchDimensions = ( dataset.pitch_dimensions ) - self.label_col = label_col - self.graph_id_col = graph_id_col self.chunk_size = chunk_size self.attacking_non_qb_node_value = attacking_non_qb_node_value @@ -108,163 +107,143 @@ def _apply_settings(self): verbose=self.verbose, ) - def _convert(self): - def __compute(args: List[pl.Series]) -> dict: - x = args[0].to_numpy() - y = args[1].to_numpy() - s = args[2].to_numpy() - a = args[3].to_numpy() - dis = args[4].to_numpy() - o = args[5].to_numpy() - dir = args[6].to_numpy() - team = args[7].to_numpy() - official_position = args[8].to_numpy() - possession_team = args[9].to_numpy() - height = args[10].to_numpy() - weight = args[11].to_numpy() - graph_id = args[12].to_numpy() - label = args[13].to_numpy() - - if not np.all(graph_id == graph_id[0]): - raise Exception( - "GraphId selection contains multiple different values. Make sure each GraphId is unique by at least playId and frameId..." - ) - - if not np.all(label == label[0]): - raise Exception( - "Label selection contains multiple different values for a single selection (group by) of playId and frameId, make sure this is not the case. Each group can only have 1 label." 
- ) - adjacency_matrix = compute_adjacency_matrix( - team=team, possession_team=possession_team, settings=self.settings - ) - edge_features = compute_edge_features( - adjacency_matrix=adjacency_matrix, - p=np.stack((x, y), axis=-1), - s=s, - a=a, - dir=dir, - o=o, # Shape will be (N, 2) - team=team, - settings=self.settings, + @property + def __exprs_variables(self): + return [ + Column.X, + Column.Y, + Column.SPEED, + Column.ACCELERATION, + Column.ORIENTATION, + Column.DIRECTION, + Column.TEAM, + Column.OFFICIAL_POSITION, + Column.POSSESSION_TEAM, + Column.HEIGHT_CM, + Column.WEIGHT_KG, + self.graph_id_col, + self.label_col, + ] + + def __compute(self, args: List[pl.Series]) -> dict: + d = {col: args[i].to_numpy() for i, col in enumerate(self.__exprs_variables)} + + if not np.all(d[self.graph_id_col] == d[self.graph_id_col][0]): + raise Exception( + "GraphId selection contains multiple different values. Make sure each graph_id is unique by at least game_id and frame_id..." ) - node_features = compute_node_features( - x, - y, - s=s, - a=a, - dir=dir, - o=o, - team=team, - official_position=official_position, - possession_team=possession_team, - height=height, - weight=weight, - settings=self.settings, + + if not self.prediction and not np.all( + d[self.label_col] == d[self.label_col][0] + ): + raise Exception( + """Label selection contains multiple different values for a single selection (group by) of game_id and frame_id, + make sure this is not the case. 
Each group can only have 1 label.""" ) - return { - "e": pl.Series( - [edge_features.tolist()], dtype=pl.List(pl.List(pl.Float64)) - ), - "x": pl.Series( - [node_features.tolist()], dtype=pl.List(pl.List(pl.Float64)) - ), - "a": pl.Series( - [adjacency_matrix.tolist()], dtype=pl.List(pl.List(pl.Int32)) - ), - "e_shape_0": edge_features.shape[0], - "e_shape_1": edge_features.shape[1], - "x_shape_0": node_features.shape[0], - "x_shape_1": node_features.shape[1], - "a_shape_0": adjacency_matrix.shape[0], - "a_shape_1": adjacency_matrix.shape[1], - self.graph_id_col: graph_id[0], - self.label_col: label[0], - } - - result_df = self.dataset.group_by( - ["gameId", "playId", "frameId"], maintain_order=True - ).agg( - pl.map_groups( - exprs=[ - "x", - "y", - "s", - "a", - "dis", - "o", - "dir", - "team", - "officialPosition", - "possessionTeam", - "height_cm", - "weight_kg", - self.graph_id_col, - self.label_col, - ], - function=__compute, - ).alias("result_dict") - ) - graph_df = result_df.with_columns( - [ - pl.col("result_dict").struct.field("a").alias("a"), - pl.col("result_dict").struct.field("e").alias("e"), - pl.col("result_dict").struct.field("x").alias("x"), - pl.col("result_dict").struct.field("e_shape_0").alias("e_shape_0"), - pl.col("result_dict").struct.field("e_shape_1").alias("e_shape_1"), - pl.col("result_dict").struct.field("x_shape_0").alias("x_shape_0"), - pl.col("result_dict").struct.field("x_shape_1").alias("x_shape_1"), - pl.col("result_dict").struct.field("a_shape_0").alias("a_shape_0"), - pl.col("result_dict").struct.field("a_shape_1").alias("a_shape_1"), - pl.col("result_dict") - .struct.field(self.graph_id_col) - .alias(self.graph_id_col), - pl.col("result_dict") - .struct.field(self.label_col) - .alias(self.label_col), - ] + adjacency_matrix = compute_adjacency_matrix( + team=d[Column.TEAM], + possession_team=d[Column.POSSESSION_TEAM], + settings=self.settings, ) + edge_features = compute_edge_features( + adjacency_matrix=adjacency_matrix, + 
p=np.stack((d[Column.X], d[Column.Y]), axis=-1), + s=d[Column.SPEED], + a=d[Column.ACCELERATION], + dir=d[Column.DIRECTION], + o=d[Column.ORIENTATION], + team=d[Column.TEAM], + settings=self.settings, + ) + node_features = compute_node_features( + x=d[Column.X], + y=d[Column.Y], + s=d[Column.SPEED], + a=d[Column.ACCELERATION], + dir=d[Column.DIRECTION], + o=d[Column.ORIENTATION], + team=d[Column.TEAM], + official_position=d[Column.OFFICIAL_POSITION], + possession_team=d[Column.POSSESSION_TEAM], + height=d[Column.HEIGHT_CM], + weight=d[Column.WEIGHT_KG], + settings=self.settings, + ) + return { + "e": pl.Series( + [edge_features.tolist()], dtype=pl.List(pl.List(pl.Float64)) + ), + "x": pl.Series( + [node_features.tolist()], dtype=pl.List(pl.List(pl.Float64)) + ), + "a": pl.Series( + [adjacency_matrix.tolist()], dtype=pl.List(pl.List(pl.Int32)) + ), + "e_shape_0": edge_features.shape[0], + "e_shape_1": edge_features.shape[1], + "x_shape_0": node_features.shape[0], + "x_shape_1": node_features.shape[1], + "a_shape_0": adjacency_matrix.shape[0], + "a_shape_1": adjacency_matrix.shape[1], + self.graph_id_col: d[self.graph_id_col][0], + self.label_col: d[self.label_col][0], + } - return graph_df.drop("result_dict") - - def to_graph_frames(self) -> List[dict]: - def __convert_to_graph_data_list(df): - lazy_df = df.lazy() - - graph_list = [] - - for chunk in lazy_df.collect().iter_slices(self.chunk_size): - chunk_graph_list = [ - { - "a": make_sparse( - flatten_to_reshaped_array( - arr=chunk["a"][i], - s0=chunk["a_shape_0"][i], - s1=chunk["a_shape_1"][i], - ) - ), - "x": flatten_to_reshaped_array( - arr=chunk["x"][i], - s0=chunk["x_shape_0"][i], - s1=chunk["x_shape_1"][i], - ), - "e": flatten_to_reshaped_array( - arr=chunk["e"][i], - s0=chunk["e_shape_0"][i], - s1=chunk["e_shape_1"][i], - ), - "y": np.asarray([chunk[self.label_col][i]]), - "id": chunk[self.graph_id_col][i], - } - for i in range(len(chunk["a"])) + def _convert(self): + # Group and aggregate in one step + 
return ( + self.dataset.group_by(Group.BY_FRAME, maintain_order=True) + .agg( + pl.map_groups( + exprs=self.__exprs_variables, function=self.__compute + ).alias("result_dict") + ) + .with_columns( + [ + *[ + pl.col("result_dict").struct.field(f).alias(f) + for f in ["a", "e", "x", self.graph_id_col, self.label_col] + ], + *[ + pl.col("result_dict") + .struct.field(f"{m}_shape_{i}") + .alias(f"{m}_shape_{i}") + for m in ["a", "e", "x"] + for i in [0, 1] + ], ] - graph_list.extend(chunk_graph_list) + ) + .drop("result_dict") + ) - return graph_list + def to_graph_frames(self) -> List[dict]: + def process_chunk(chunk: pl.DataFrame) -> List[dict]: + return [ + { + "a": make_sparse( + reshape_array( + chunk["a"][i], chunk["a_shape_0"][i], chunk["a_shape_1"][i] + ) + ), + "x": reshape_array( + chunk["x"][i], chunk["x_shape_0"][i], chunk["x_shape_1"][i] + ), + "e": reshape_array( + chunk["e"][i], chunk["e_shape_0"][i], chunk["e_shape_1"][i] + ), + "y": np.asarray([chunk[self.label_col][i]]), + "id": chunk[self.graph_id_col][i], + } + for i in range(len(chunk)) + ] graph_df = self._convert() - self.graph_frames = __convert_to_graph_data_list(graph_df) - + self.graph_frames = [ + graph + for chunk in graph_df.lazy().collect().iter_slices(self.chunk_size) + for graph in process_chunk(chunk) + ] return self.graph_frames def to_spektral_graphs(self) -> List[Graph]: diff --git a/unravel/american_football/graphs/graph_settings.py b/unravel/american_football/graphs/graph_settings.py index 5de30aa..9c96dfe 100644 --- a/unravel/american_football/graphs/graph_settings.py +++ b/unravel/american_football/graphs/graph_settings.py @@ -26,8 +26,6 @@ def __post_init__(self): @dataclass class AmericanFootballGraphSettings(DefaultGraphSettings): pitch_dimensions: AmericanFootballPitchDimensions = None - ball_id: str = "football" - qb_id: str = "QB" attacking_non_qb_node_value: float = 0.1 max_height: float = 225.0 # in cm min_height: float = 150.0 diff --git 
a/unravel/soccer/graphs/dataset.py b/unravel/soccer/graphs/dataset.py index e9d66b5..b15c452 100644 --- a/unravel/soccer/graphs/dataset.py +++ b/unravel/soccer/graphs/dataset.py @@ -41,12 +41,12 @@ class Column: Y = "y" Z = "z" - V = "v" + SPEED = "v" VX = "vx" VY = "vy" VZ = "vz" - A = "a" + ACCELERATION = "a" AX = "ax" AY = "ay" AZ = "az" @@ -67,14 +67,18 @@ class SoccerObject: @dataclass class KloppyPolarsDataset(DefaultDataset): - kloppy_dataset: TrackingDataset - ball_carrier_threshold: float = 25.0 - _graph_id_column: str = field(default="graph_id") - _label_column: str = field(default="label") - _overwrite_orientation: bool = field(default=False, init=False) - _infer_goalkeepers: bool = field(default=False, init=False) - - def __post_init__(self): + def __init__( + self, + kloppy_dataset: TrackingDataset, + ball_carrier_threshold: float = 25.0, + **kwargs, + ): + super().__init__(**kwargs) + self.kloppy_dataset = kloppy_dataset + self.ball_carrier_threshold = ball_carrier_threshold + self._overwrite_orientation: bool = False + self._infer_goalkeepers: bool = False + if not isinstance(self.kloppy_dataset, TrackingDataset): raise Exception("'kloppy_dataset' should be of type float") @@ -284,7 +288,7 @@ def __add_velocity( + pl.col(Column.VZ) ** 2 ) .sqrt() - .alias(Column.V) + .alias(Column.SPEED) ] ) @@ -325,7 +329,7 @@ def __add_acceleration(self, df: pl.DataFrame): + pl.col(Column.AZ) ** 2 ) .sqrt() - .alias(Column.A) + .alias(Column.ACCELERATION) ] ) ) diff --git a/unravel/soccer/graphs/features/edge_features_pl.py b/unravel/soccer/graphs/features/edge_features_pl.py index 3852e6d..ce4defe 100644 --- a/unravel/soccer/graphs/features/edge_features_pl.py +++ b/unravel/soccer/graphs/features/edge_features_pl.py @@ -20,6 +20,8 @@ normalize_accelerations_nfl, ) +from ..dataset import Constant + def compute_edge_features_pl(adjacency_matrix, p3d, p2d, s, velocity, team, settings): # Compute pairwise distances using broadcasting @@ -39,6 +41,7 @@ def 
compute_edge_features_pl(adjacency_matrix, p3d, p2d, s, velocity, team, sett speed_diff_matrix_normed = normalize_speed_differences_nfl( s=speed_diff_matrix, team=team, + ball_id=Constant.BALL, settings=settings, ) diff --git a/unravel/soccer/graphs/features/node_features_pl.py b/unravel/soccer/graphs/features/node_features_pl.py index 804ebd4..c95d8b2 100644 --- a/unravel/soccer/graphs/features/node_features_pl.py +++ b/unravel/soccer/graphs/features/node_features_pl.py @@ -18,6 +18,7 @@ normalize_speed, distance_to_ball, ) +from ..dataset import Constant def compute_node_features_pl( @@ -31,7 +32,7 @@ def compute_node_features_pl( ball_carrier, settings, ): - ball_id = settings.ball_id + ball_id = Constant.BALL goal_mouth_position = ( settings.pitch_dimensions.x_dim.max, @@ -60,7 +61,7 @@ def compute_node_features_pl( max_value=settings.pitch_dimensions.y_dim.max, min_value=settings.pitch_dimensions.y_dim.min, ) - s_normed = normalize_speeds_nfl(s, team, settings) + s_normed = normalize_speeds_nfl(s, team, ball_id=Constant.BALL, settings=settings) uv_velocity = unit_vectors(velocity) angles = normalize_angles(np.arctan2(uv_velocity[:, 1], uv_velocity[:, 0])) diff --git a/unravel/soccer/graphs/graph_converter_pl.py b/unravel/soccer/graphs/graph_converter_pl.py index 077e296..50d07b6 100644 --- a/unravel/soccer/graphs/graph_converter_pl.py +++ b/unravel/soccer/graphs/graph_converter_pl.py @@ -1,34 +1,16 @@ import logging import sys -from copy import deepcopy -import pandas as pd - -import warnings - -from dataclasses import dataclass, field, asdict +from dataclasses import dataclass from typing import List, Union, Dict, Literal, Any from kloppy.domain import ( - TrackingDataset, - Frame, - Orientation, - DatasetTransformer, - DatasetFlag, - SecondSpectrumCoordinateSystem, MetricPitchDimensions, ) from spektral.data import Graph -from .exceptions import ( - MissingLabelsError, - MissingDatasetError, - IncorrectDatasetTypeError, - KeyMismatchError, -) - from 
.graph_settings_pl import GraphSettingsPolars from .dataset import KloppyPolarsDataset, Column, Group, Constant from .features import ( @@ -106,11 +88,11 @@ def _apply_padding(self) -> pl.DataFrame: Column.VX, Column.VY, Column.VZ, - Column.V, + Column.SPEED, Column.AX, Column.AY, Column.AZ, - Column.A, + Column.ACCELERATION, ] group_by_columns = [ Column.GAME_ID, @@ -214,29 +196,29 @@ def _apply_filters(self): return self.dataset.with_columns( pl.when( (pl.col(Column.OBJECT_ID) == Constant.BALL) - & (pl.col(Column.V) > self.settings.max_ball_speed) + & (pl.col(Column.SPEED) > self.settings.max_ball_speed) ) .then(self.settings.max_ball_speed) .when( (pl.col(Column.OBJECT_ID) != Constant.BALL) - & (pl.col(Column.V) > self.settings.max_player_speed) + & (pl.col(Column.SPEED) > self.settings.max_player_speed) ) .then(self.settings.max_player_speed) - .otherwise(pl.col(Column.V)) - .alias(Column.V) + .otherwise(pl.col(Column.SPEED)) + .alias(Column.SPEED) ).with_columns( pl.when( (pl.col(Column.OBJECT_ID) == Constant.BALL) - & (pl.col(Column.A) > self.settings.max_ball_acceleration) + & (pl.col(Column.ACCELERATION) > self.settings.max_ball_acceleration) ) .then(self.settings.max_ball_acceleration) .when( (pl.col(Column.OBJECT_ID) != Constant.BALL) - & (pl.col(Column.A) > self.settings.max_player_acceleration) + & (pl.col(Column.ACCELERATION) > self.settings.max_player_acceleration) ) .then(self.settings.max_player_acceleration) - .otherwise(pl.col(Column.A)) - .alias(Column.A) + .otherwise(pl.col(Column.ACCELERATION)) + .alias(Column.ACCELERATION) ) def _apply_settings(self): @@ -290,11 +272,11 @@ def __exprs_variables(self): Column.X, Column.Y, Column.Z, - Column.V, + Column.SPEED, Column.VX, Column.VY, Column.VZ, - Column.A, + Column.ACCELERATION, Column.AX, Column.AY, Column.AZ, @@ -321,6 +303,7 @@ def __compute(self, args: List[pl.Series]) -> dict: """Label selection contains multiple different values for a single selection (group by) of game_id and frame_id, make 
sure this is not the case. Each group can only have 1 label.""" ) + ball_carriers = np.where(d[Column.IS_BALL_CARRIER] == True)[0] if len(ball_carriers) == 0: ball_carrier_idx = None @@ -339,7 +322,7 @@ def __compute(self, args: List[pl.Series]) -> dict: adjacency_matrix=adjacency_matrix, p3d=np.stack((d[Column.X], d[Column.Y], d[Column.Z]), axis=-1), p2d=np.stack((d[Column.X], d[Column.Y]), axis=-1), - s=d[Column.V], + s=d[Column.SPEED], velocity=velocity, team=d[Column.TEAM_ID], settings=self.settings, @@ -348,7 +331,7 @@ def __compute(self, args: List[pl.Series]) -> dict: node_features = compute_node_features_pl( d[Column.X], d[Column.Y], - s=d[Column.V], + s=d[Column.SPEED], velocity=velocity, team=d[Column.TEAM_ID], possession_team=d[Column.BALL_OWNING_TEAM_ID], @@ -377,77 +360,6 @@ def __compute(self, args: List[pl.Series]) -> dict: self.label_col: d[self.label_col][0], } - # def _convert(self): - # result_df = self.dataset.group_by(Group.BY_FRAME, maintain_order=True).agg( - # pl.map_groups( - # exprs=self.__exprs_variables, - # function=self.__compute, - # ).alias("result_dict") - # ) - - # graph_df = result_df.with_columns( - # [ - # pl.col("result_dict").struct.field("a").alias("a"), - # pl.col("result_dict").struct.field("e").alias("e"), - # pl.col("result_dict").struct.field("x").alias("x"), - # pl.col("result_dict").struct.field("e_shape_0").alias("e_shape_0"), - # pl.col("result_dict").struct.field("e_shape_1").alias("e_shape_1"), - # pl.col("result_dict").struct.field("x_shape_0").alias("x_shape_0"), - # pl.col("result_dict").struct.field("x_shape_1").alias("x_shape_1"), - # pl.col("result_dict").struct.field("a_shape_0").alias("a_shape_0"), - # pl.col("result_dict").struct.field("a_shape_1").alias("a_shape_1"), - # pl.col("result_dict") - # .struct.field(self.graph_id_col) - # .alias(self.graph_id_col), - # pl.col("result_dict") - # .struct.field(self.label_col) - # .alias(self.label_col), - # ] - # ) - - # return graph_df.drop("result_dict") - - # 
def to_graph_frames(self) -> List[dict]: - # def __convert_to_graph_data_list(df): - # lazy_df = df.lazy() - - # graph_list = [] - - # for chunk in lazy_df.collect().iter_slices(self.chunk_size): - # chunk_graph_list = [ - # { - # "a": make_sparse( - # flatten_to_reshaped_array( - # arr=chunk["a"][i], - # s0=chunk["a_shape_0"][i], - # s1=chunk["a_shape_1"][i], - # ) - # ), - # "x": flatten_to_reshaped_array( - # arr=chunk["x"][i], - # s0=chunk["x_shape_0"][i], - # s1=chunk["x_shape_1"][i], - # ), - # "e": flatten_to_reshaped_array( - # arr=chunk["e"][i], - # s0=chunk["e_shape_0"][i], - # s1=chunk["e_shape_1"][i], - # ), - # "y": np.asarray([chunk[self.label_col][i]]), - # "id": chunk[self.graph_id_col][i], - # } - # for i in range(len(chunk["a"])) - # ] - # graph_list.extend(chunk_graph_list) - - # return graph_list - - # graph_df = self._convert() - # self.graph_frames = __convert_to_graph_data_list(graph_df) - - # return self.graph_frames - - ### def _convert(self): # Group and aggregate in one step return ( @@ -475,23 +387,19 @@ def _convert(self): .drop("result_dict") ) - @staticmethod - def _reshape_array(arr, s0, s1): - return np.array([item for sublist in arr for item in sublist]).reshape(s0, s1) - def to_graph_frames(self) -> List[dict]: def process_chunk(chunk: pl.DataFrame) -> List[dict]: return [ { "a": make_sparse( - self._reshape_array( + reshape_array( chunk["a"][i], chunk["a_shape_0"][i], chunk["a_shape_1"][i] ) ), - "x": self._reshape_array( + "x": reshape_array( chunk["x"][i], chunk["x_shape_0"][i], chunk["x_shape_1"][i] ), - "e": self._reshape_array( + "e": reshape_array( chunk["e"][i], chunk["e_shape_0"][i], chunk["e_shape_1"][i] ), "y": np.asarray([chunk[self.label_col][i]]), @@ -508,8 +416,6 @@ def process_chunk(chunk: pl.DataFrame) -> List[dict]: ] return self.graph_frames - ### - def to_spektral_graphs(self) -> List[Graph]: if not self.graph_frames: self.to_graph_frames() diff --git a/unravel/utils/features/utils.py 
b/unravel/utils/features/utils.py index c11e8f3..ef89bc2 100644 --- a/unravel/utils/features/utils.py +++ b/unravel/utils/features/utils.py @@ -146,8 +146,8 @@ def normalize_acceleration(value, max_acceleration): return np.clip(x, -1, 1) -def normalize_speeds_nfl(s, team, settings): - ball_mask = team == settings.ball_id +def normalize_speeds_nfl(s, team, ball_id, settings): + ball_mask = team == ball_id s_normed = np.zeros_like(s) s_normed[ball_mask] = normalize_speed(s[ball_mask], settings.max_ball_speed) @@ -156,13 +156,13 @@ def normalize_speeds_nfl(s, team, settings): return s_normed -def normalize_speed_differences_nfl(s, team, settings): +def normalize_speed_differences_nfl(s, team, ball_id, settings): - return normalize_speeds_nfl(s, team, settings) * np.sign(s) + return normalize_speeds_nfl(s, team, ball_id, settings) * np.sign(s) -def normalize_accelerations_nfl(a, team, settings): - ball_mask = team == settings.ball_id +def normalize_accelerations_nfl(a, team, ball_id, settings): + ball_mask = team == ball_id a_normed = np.zeros_like(a) a_normed[ball_mask] = normalize_acceleration( @@ -183,6 +183,10 @@ def flatten_to_reshaped_array(arr, s0, s1, as_list=False): return result_array if not as_list else result_array.tolist() +def reshape_array(arr, s0, s1): + return np.array([item for sublist in arr for item in sublist]).reshape(s0, s1) + + def distance_to_ball( x: np.array, y: np.array, team: np.array, ball_id: str, z: np.array = None ): diff --git a/unravel/utils/objects/default_dataset.py b/unravel/utils/objects/default_dataset.py index b31280e..17ad9b6 100644 --- a/unravel/utils/objects/default_dataset.py +++ b/unravel/utils/objects/default_dataset.py @@ -1,8 +1,11 @@ -from dataclasses import dataclass +from dataclasses import dataclass, field @dataclass class DefaultDataset: + _graph_id_column: str = field(default="graph_id") + _label_column: str = field(default="label") + def load(self): raise NotImplementedError() diff --git 
a/unravel/utils/objects/default_graph_converter.py b/unravel/utils/objects/default_graph_converter.py index dd3f3f5..79bc16e 100644 --- a/unravel/utils/objects/default_graph_converter.py +++ b/unravel/utils/objects/default_graph_converter.py @@ -152,6 +152,9 @@ def __post_init__(self): if not isinstance(self.verbose, bool): raise Exception("'verbose' should be of type boolean (bool)") + def _shuffle(self): + raise NotImplementedError() + def _sport_specific_checks(self): raise NotImplementedError( "No sport specific checks implementend... Make sure to check for existens of labels of some sort, and graph ids of some sort..." From 08e79c398220359521e21106a8cd9a03cb05c822 Mon Sep 17 00:00:00 2001 From: "UnravelSports [JB]" Date: Sun, 26 Jan 2025 13:38:51 +0100 Subject: [PATCH 06/10] deprecation warning --- unravel/soccer/graphs/graph_converter.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/unravel/soccer/graphs/graph_converter.py b/unravel/soccer/graphs/graph_converter.py index 31c093b..d57cb5c 100644 --- a/unravel/soccer/graphs/graph_converter.py +++ b/unravel/soccer/graphs/graph_converter.py @@ -4,7 +4,7 @@ from scipy.spatial.qhull import QhullError -import warnings +from warnings import warn, simplefilter from dataclasses import dataclass, field, asdict @@ -33,6 +33,9 @@ from ...utils import * +simplefilter("always", DeprecationWarning) + + logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) stdout_handler = logging.StreamHandler(sys.stdout) @@ -80,6 +83,14 @@ class SoccerGraphConverter(DefaultGraphConverter): non_potential_receiver_node_value: float = 0.1 def __post_init__(self): + warn( + """ + This class is deprecated and will be removed in a future release. Please use SoccerGraphConverterPolars for better performance. + Note: SoccerGraphConverterPolars is not one-to-one compatible with models and dataset created from SoccerGraphConverter due to breaking changes. 
+ """, + category=DeprecationWarning, + stacklevel=2, + ) if not self.dataset: raise Exception("Please provide a 'kloppy' dataset.") @@ -204,7 +215,7 @@ def _convert(self, frame: Frame): if not self.prediction and label is None: if self.settings.verbose: - warnings.warn( + warn( f"""No label for frame={frame.frame_id} in 'labels'...""", NoLabelWarning, ) From ce14e38a78f89c846f117c4ba873572e4bb0b3b4 Mon Sep 17 00:00:00 2001 From: "UnravelSports [JB]" Date: Sun, 26 Jan 2025 13:50:51 +0100 Subject: [PATCH 07/10] kloppy 3.16 --- requirements.txt | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 72295da..089acfb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ numpy==1.26.4 spektral==1.2.0 -kloppy==3.15.0 +kloppy==3.16.0 tensorflow>=2.14.0; platform_machine != 'arm64' or platform_system != 'Darwin' tensorflow-macos>=2.14.0; platform_machine == 'arm64' and platform_system == 'Darwin' keras==2.14.0 diff --git a/setup.py b/setup.py index 9f4fa76..9ec6b79 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def read_version(): python_requires="~=3.11", install_requires=[ "spektral==1.2.0", - "kloppy==3.15.0", + "kloppy==3.16.0", "tensorflow>=2.14.0;platform_machine != 'arm64' or platform_system != 'Darwin'", "tensorflow-macos>=2.14.0;platform_machine == 'arm64' and platform_system == 'Darwin'", "keras==2.14.0", From 676014ebfcd6831e001dc1e7f6c848bb8a54f6b6 Mon Sep 17 00:00:00 2001 From: "UnravelSports [JB]" Date: Sun, 26 Jan 2025 17:26:23 +0100 Subject: [PATCH 08/10] minor --- unravel/__init__.py | 2 +- unravel/american_football/graphs/dataset.py | 2 +- .../graphs/graph_converter.py | 42 +++++++++++------ unravel/soccer/graphs/graph_converter.py | 1 - unravel/soccer/graphs/graph_converter_pl.py | 46 ++++++++++++------- .../utils/objects/default_graph_converter.py | 3 ++ 6 files changed, 61 insertions(+), 35 deletions(-) diff --git a/unravel/__init__.py b/unravel/__init__.py 
index b0cda09..b235f04 100644 --- a/unravel/__init__.py +++ b/unravel/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.2.0" +__version__ = "0.3.0" from .soccer import * from .american_football import * diff --git a/unravel/american_football/graphs/dataset.py b/unravel/american_football/graphs/dataset.py index 4b8ccff..5273b4a 100644 --- a/unravel/american_football/graphs/dataset.py +++ b/unravel/american_football/graphs/dataset.py @@ -125,7 +125,7 @@ def load(self): separator=",", encoding="utf8", null_values=["NA", "NULL", ""], - dtypes={"birthDate": pl.Date}, + schema_overrides={"birthDate": pl.Date}, ignore_errors=True, ) if "position" in players.columns: diff --git a/unravel/american_football/graphs/graph_converter.py b/unravel/american_football/graphs/graph_converter.py index 07b01e1..5f899b7 100644 --- a/unravel/american_football/graphs/graph_converter.py +++ b/unravel/american_football/graphs/graph_converter.py @@ -48,8 +48,14 @@ def __init__( if not isinstance(dataset, BigDataBowlDataset): raise Exception("'dataset' should be an instance of BigDataBowlDataset") - self.label_col = dataset._label_column - self.graph_id_col = dataset._graph_id_column + self.label_column: str = ( + self.label_col if self.label_col is not None else dataset._label_column + ) + self.graph_id_column: str = ( + self.graph_id_col + if self.graph_id_col is not None + else dataset._graph_id_column + ) self.dataset: pl.DataFrame = dataset.data self.pitch_dimensions: AmericanFootballPitchDimensions = ( @@ -64,21 +70,21 @@ def __init__( def _sport_specific_checks(self): - if not isinstance(self.label_col, str): + if not isinstance(self.label_column, str): raise Exception("'label_col' should be of type string (str)") - if not isinstance(self.graph_id_col, str): + if not isinstance(self.graph_id_column, str): raise Exception("'graph_id_col' should be of type string (str)") if not isinstance(self.chunk_size, int): raise Exception("chunk_size should be of type integer (int)") - if not 
self.label_col in self.dataset.columns and not self.prediction: + if not self.label_column in self.dataset.columns and not self.prediction: raise Exception( "Please specify a 'label_col' and add that column to your 'dataset' or set 'prediction=True' if you want to use the converted dataset to make predictions on." ) - if not self.graph_id_col in self.dataset.columns: + if not self.graph_id_column in self.dataset.columns: raise Exception( "Please specify a 'graph_id_col' and add that column to your 'dataset' ..." ) @@ -121,20 +127,20 @@ def __exprs_variables(self): Column.POSSESSION_TEAM, Column.HEIGHT_CM, Column.WEIGHT_KG, - self.graph_id_col, - self.label_col, + self.graph_id_column, + self.label_column, ] def __compute(self, args: List[pl.Series]) -> dict: d = {col: args[i].to_numpy() for i, col in enumerate(self.__exprs_variables)} - if not np.all(d[self.graph_id_col] == d[self.graph_id_col][0]): + if not np.all(d[self.graph_id_column] == d[self.graph_id_column][0]): raise Exception( "GraphId selection contains multiple different values. Make sure each graph_id is unique by at least game_id and frame_id..." 
) if not self.prediction and not np.all( - d[self.label_col] == d[self.label_col][0] + d[self.label_column] == d[self.label_column][0] ): raise Exception( """Label selection contains multiple different values for a single selection (group by) of game_id and frame_id, @@ -186,8 +192,8 @@ def __compute(self, args: List[pl.Series]) -> dict: "x_shape_1": node_features.shape[1], "a_shape_0": adjacency_matrix.shape[0], "a_shape_1": adjacency_matrix.shape[1], - self.graph_id_col: d[self.graph_id_col][0], - self.label_col: d[self.label_col][0], + self.graph_id_column: d[self.graph_id_column][0], + self.label_column: d[self.label_column][0], } def _convert(self): @@ -203,7 +209,13 @@ def _convert(self): [ *[ pl.col("result_dict").struct.field(f).alias(f) - for f in ["a", "e", "x", self.graph_id_col, self.label_col] + for f in [ + "a", + "e", + "x", + self.graph_id_column, + self.label_column, + ] ], *[ pl.col("result_dict") @@ -232,8 +244,8 @@ def process_chunk(chunk: pl.DataFrame) -> List[dict]: "e": reshape_array( chunk["e"][i], chunk["e_shape_0"][i], chunk["e_shape_1"][i] ), - "y": np.asarray([chunk[self.label_col][i]]), - "id": chunk[self.graph_id_col][i], + "y": np.asarray([chunk[self.label_column][i]]), + "id": chunk[self.graph_id_column][i], } for i in range(len(chunk)) ] diff --git a/unravel/soccer/graphs/graph_converter.py b/unravel/soccer/graphs/graph_converter.py index d57cb5c..1262598 100644 --- a/unravel/soccer/graphs/graph_converter.py +++ b/unravel/soccer/graphs/graph_converter.py @@ -71,7 +71,6 @@ class SoccerGraphConverter(DefaultGraphConverter): dataset: TrackingDataset = None labels: dict = None - labels: dict = None graph_id: Union[str, int, dict] = None graph_ids: dict = None diff --git a/unravel/soccer/graphs/graph_converter_pl.py b/unravel/soccer/graphs/graph_converter_pl.py index 50d07b6..5beeb82 100644 --- a/unravel/soccer/graphs/graph_converter_pl.py +++ b/unravel/soccer/graphs/graph_converter_pl.py @@ -45,8 +45,14 @@ class 
SoccerGraphConverterPolars(DefaultGraphConverter): def __post_init__(self): self.pitch_dimensions: MetricPitchDimensions = self.dataset.pitch_dimensions - self.label_col = self.dataset._label_column - self.graph_id_col = self.dataset._graph_id_column + self.label_column: str = ( + self.label_col if self.label_col is not None else self.dataset._label_column + ) + self.graph_id_column: str = ( + self.graph_id_col + if self.graph_id_col is not None + else self.dataset._graph_id_column + ) self.dataset = self.dataset.data @@ -76,8 +82,8 @@ def _apply_padding(self) -> pl.DataFrame: Column.TIMESTAMP, Column.BALL_STATE, Column.POSITION_NAME, - self.label_col, - self.graph_id_col, + self.label_column, + self.graph_id_column, ] empty_columns = [ Column.OBJECT_ID, @@ -240,21 +246,21 @@ def _apply_settings(self): ) def _sport_specific_checks(self): - if not isinstance(self.label_col, str): + if not isinstance(self.label_column, str): raise Exception("'label_col' should be of type string (str)") - if not isinstance(self.graph_id_col, str): + if not isinstance(self.graph_id_column, str): raise Exception("'graph_id_col' should be of type string (str)") if not isinstance(self.chunk_size, int): raise Exception("chunk_size should be of type integer (int)") - if not self.label_col in self.dataset.columns and not self.prediction: + if not self.label_column in self.dataset.columns and not self.prediction: raise Exception( "Please specify a 'label_col' and add that column to your 'dataset' or set 'prediction=True' if you want to use the converted dataset to make predictions on." ) - if not self.graph_id_col in self.dataset.columns: + if not self.graph_id_column in self.dataset.columns: raise Exception( "Please specify a 'graph_id_col' and add that column to your 'dataset' ..." 
) @@ -284,20 +290,20 @@ def __exprs_variables(self): Column.POSITION_NAME, Column.BALL_OWNING_TEAM_ID, Column.IS_BALL_CARRIER, - self.graph_id_col, - self.label_col, + self.graph_id_column, + self.label_column, ] def __compute(self, args: List[pl.Series]) -> dict: d = {col: args[i].to_numpy() for i, col in enumerate(self.__exprs_variables)} - if not np.all(d[self.graph_id_col] == d[self.graph_id_col][0]): + if not np.all(d[self.graph_id_column] == d[self.graph_id_column][0]): raise Exception( "GraphId selection contains multiple different values. Make sure each graph_id is unique by at least game_id and frame_id..." ) if not self.prediction and not np.all( - d[self.label_col] == d[self.label_col][0] + d[self.label_column] == d[self.label_column][0] ): raise Exception( """Label selection contains multiple different values for a single selection (group by) of game_id and frame_id, @@ -356,8 +362,8 @@ def __compute(self, args: List[pl.Series]) -> dict: "x_shape_1": node_features.shape[1], "a_shape_0": adjacency_matrix.shape[0], "a_shape_1": adjacency_matrix.shape[1], - self.graph_id_col: d[self.graph_id_col][0], - self.label_col: d[self.label_col][0], + self.graph_id_column: d[self.graph_id_column][0], + self.label_column: d[self.label_column][0], } def _convert(self): @@ -373,7 +379,13 @@ def _convert(self): [ *[ pl.col("result_dict").struct.field(f).alias(f) - for f in ["a", "e", "x", self.graph_id_col, self.label_col] + for f in [ + "a", + "e", + "x", + self.graph_id_column, + self.label_column, + ] ], *[ pl.col("result_dict") @@ -402,8 +414,8 @@ def process_chunk(chunk: pl.DataFrame) -> List[dict]: "e": reshape_array( chunk["e"][i], chunk["e_shape_0"][i], chunk["e_shape_1"][i] ), - "y": np.asarray([chunk[self.label_col][i]]), - "id": chunk[self.graph_id_col][i], + "y": np.asarray([chunk[self.label_column][i]]), + "id": chunk[self.graph_id_column][i], } for i in range(len(chunk)) ] diff --git a/unravel/utils/objects/default_graph_converter.py 
b/unravel/utils/objects/default_graph_converter.py index 79bc16e..dfc9133 100644 --- a/unravel/utils/objects/default_graph_converter.py +++ b/unravel/utils/objects/default_graph_converter.py @@ -87,6 +87,9 @@ class DefaultGraphConverter: pad: bool = False verbose: bool = False + label_col: str = None + graph_id_col: str = None + graph_frames: dict = field(init=False, repr=False, default=None) settings: DefaultGraphSettings = field( init=False, repr=False, default_factory=DefaultGraphSettings From 2b4e6a84234f531966c5984bbdf9e97a4cb780f4 Mon Sep 17 00:00:00 2001 From: "UnravelSports [JB]" Date: Mon, 27 Jan 2025 09:35:04 +0100 Subject: [PATCH 09/10] updated examples --- examples/1_kloppy_gnn_train.ipynb | 143 ++-- examples/deprecated/1_kloppy_gnn_train.ipynb | 794 +++++++++++++++++++ examples/graphs_faq.md | 6 +- unravel/soccer/graphs/graph_converter_pl.py | 9 +- 4 files changed, 896 insertions(+), 56 deletions(-) create mode 100644 examples/deprecated/1_kloppy_gnn_train.ipynb diff --git a/examples/1_kloppy_gnn_train.ipynb b/examples/1_kloppy_gnn_train.ipynb index be6bd2e..39aee88 100644 --- a/examples/1_kloppy_gnn_train.ipynb +++ b/examples/1_kloppy_gnn_train.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 🌀 unravel kloppy into graph neural network!\n", + "## 🌀 unravel kloppy into graph neural network using the _new_ Polars back-end!\n", "\n", "First run `pip install unravelsports` if you haven't already!\n", "\n", @@ -25,11 +25,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In this in-depth walkthrough we'll discuss everything the `unravelsports` package has to offer for converting a [Kloppy](https://github.com/PySport/kloppy) dataset of soccer tracking data into graphs for training binary classification graph neural networks using the [Spektral](https://graphneural.network/) library.\n", + "In this in-depth walkthrough we'll discuss everything the `unravelsports` package has to offer for converting a 
[Kloppy](https://github.com/PySport/kloppy) dataset of soccer tracking data into graphs for training binary classification graph neural networks using the [Spektral](https://graphneural.network/) library, and a newly added (version==0.3.0+) [Polars](https://pola.rs/) back-end.\n", "\n", "This walkthrough will touch on a lot of the concepts from [A Graph Neural Network Deep-dive into Successful Counterattacks {A. Sahasrabudhe & J. Bekkers}](https://github.com/USSoccerFederation/ussf_ssac_23_soccer_gnn). It is strongly advised to first read the [research paper (pdf)](https://ussf-ssac-23-soccer-gnn.s3.us-east-2.amazonaws.com/public/Sahasrabudhe_Bekkers_SSAC23.pdf). Some concepts are also explained in the [Graphs FAQ](graphs_faq.md).\n", "\n", - "Step by step we'll show how this package can be used to load soccer positional (tracking) data with `kloppy`, how to convert this data into \"graphs\", train a Graph Neural Network with `spektral`, evaluate it's performance, save and load the model and finally apply the model to unseen data to make predictions.\n", + "Step by step we'll show how this package can be used to load soccer positional (tracking) data with `kloppy`, how to convert this data into a `KloppyPolarsDataset`, convert it into \"graphs\", train a Graph Neural Network with `spektral`, evaluate its performance, save and load the model and finally apply the model to unseen data to make predictions.\n", "\n", "The powerful Kloppy package allows us to load and standardize data from many providers: Metrica, Sportec, Tracab, SecondSpectrum, StatsPerform and SkillCorner. In this guide we'll use some matches from the [Public SkillCorner Dataset](https://github.com/SkillCorner/opendata).\n", "\n", @@ -42,7 +42,7 @@ "\n", "- [**1. Imports**](#1-imports).\n", "- [**2. Public SkillCorner Data**](#2-public-skillcorner-data).\n", - "- [**3. Graph Converter**](#2-open-skillcorner-data).\n", + "- [**3.
⭐ _KloppyPolarsDataset_ and _SoccerGraphConverterPolars_**](#2-open-skillcorner-data).\n", "- [**4. Load Kloppy Data, Convert & Store**](#4-load-kloppy-data-convert-and-store).\n", "- [**5. Creating a Custom Graph Dataset**](#5-creating-a-custom-graph-dataset).\n", "- [**6. Prepare for Training**](#6-prepare-for-training).\n", @@ -68,18 +68,18 @@ "source": [ "### 1. Imports\n", "\n", - "We import `SoccerGraphConverter` to help us convert from Kloppy positional tracking frames to graphs.\n", + "We import `SoccerGraphConverterPolars` to help us convert from Kloppy positional tracking frames to graphs.\n", "\n", "With the power of **Kloppy** we can also load data from many providers by importing `metrica`, `sportec`, `tracab`, `secondspectrum`, or `statsperform` from `kloppy`." ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ - "from unravel.soccer import SoccerGraphConverter\n", + "from unravel.soccer import SoccerGraphConverterPolars, KloppyPolarsDataset\n", "\n", "from kloppy import skillcorner" ] @@ -128,33 +128,73 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### 3. Graph Converter\n", + "### 3. 
⭐ _KloppyPolarsDataset_ and _SoccerGraphConverterPolars_\n", "\n", "ℹī¸ For more information on:\n", "- What a Graph is, check out [Graph FAQ Section A](graphs_faq.ipynb)\n", - "- What parameters we can pass to the `SoccerGraphConverter`, check out [Graph FAQ Section B](graphs_faq.ipynb)\n", + "- What parameters we can pass to the `SoccerGraphConverterPolars`, check out [Graph FAQ Section B](graphs_faq.ipynb)\n", "- What features each Graph has, check out [Graph FAQ Section C](graphs_faq.ipynb)\n", "\n", - "---\n", + "------\n", "\n", - "To get started with the `SoccerGraphConverter` we need to pass one _required_ parameter:\n", - "- `dataset` (of type `TrackingDataset` (Kloppy)) \n", + "To get started we need to load our tracking data using Kloppy, and subsequently pass this to the `KloppyPolarsDataset`. This `KloppyPolarsDataset` also takes the `ball_carrier_threshold` parameter.\n", "\n", - "And one parameter that's required when we're converting for training purposes (more on this later):\n", - "- `labels` (a dictionary with `frame_id`s as keys and a value of `{True, False, 1 or 0}`).\n", - "```python\n", - "{83340: True, 83341: False, etc..} = {83340: 1, 83341: 0, etc..} = {83340: 1, 83341: False, etc..}\n", - "```\n", - "⚠ī¸ As mentioned before you will need to create your own labels! In this example we'll use `dummy_labels(dataset)` to generate a fake label for each frame.\n", + "🗒ī¸ KloppyPolarsDataset sets the orientation to `Orientation.BALL_OWNING_TEAM` (ball owning team plays left to right). Except when we don't know who the ball owning team is. 
This can happen when a data provider does not provide the ball owning team information.\n", + "If our dataset does not have the ball owning team we infer the ball owning team automatically using the `ball_carrier_threshold` and subsequently change the orientation automatically to be left to right for the ball owning team too.\n", + "In `SoccerGraphConverter` [deprecated] if the ball owning team was not available we set the orientation to STATIC_HOME_AWAY meaning attacking could happen in two directions. \n", + "\n", + "
\n", + "
\n",
+    "kloppy_dataset = skillcorner.load_open_data(\n",
+    "    match_id=match_id,\n",
+    "    coordinates=\"secondspectrum\",\n",
+    "    include_empty_frames=False,\n",
+    "    limit=500,  \n",
+    ")\n",
+    "kloppy_polars_dataset = KloppyPolarsDataset(\n",
+    "    kloppy_dataset=kloppy_dataset,\n",
+    "    ball_carrier_threshold=25.0\n",
+    ")\n",
+    "kloppy_polars_dataset.load()\n",
+    "
\n", + "
\n", "\n", "#### Graph Identifier(s):\n", - "When training a model on tracking data it's highly recommended to split data into test/train(/validation) sets by match or period such that all data end up in the same test, train or validation set. This should be done to avoid leaking information between test, train and validation sets. To make this simple, there are two _optional_ parameters we can pass to `SoccerGraphConverter`, namely:\n", - "- `graph_id`. This is a single identifier (str or int) for a whole match, for example the unique match id.\n", - "- `graph_ids`. This is a dictionary with the same keys as `labels`, but the values are now the unique identifiers. This option can be used if we want to split by sequence or possession_id. For example: {frame_id: 'matchId-sequenceId', frame_id: 'match_Id-sequenceId2'} etc. You will need to create your own ids. Note, if `labels` and `graph_ids` don't have the exact same keys it will throw an error.\n", + "After loading the `kloppy_polars_dataset` we now add graph identifiers. We can do this by passing a list of column names on which we want to split our data.\n", + "\n", + "🗒️ When training a model on tracking data it's highly recommended to split data into test/train(/validation) sets by match or period such that all data end up in the same test, train or validation set. This should be done to avoid leaking information between test, train and validation sets. Correctly splitting the final dataset in train, test and validation sets using these Graph Identifiers is incorporated into `CustomSpektralDataset` (see [Section 6.1](#61-split-dataset) for more information).\n", + "\n", + "\n", + "
\n", + "
\n",
+    "kloppy_polars_dataset.add_graph_ids(by=[\"game_id\", \"period_id\"])\n",
+    "
\n", + "
\n", + "\n", + "#### Graph Labels\n", "\n", - "In this example we'll use the `graph_id=match_id` as the unique identifier, but feel free to change that for `graph_ids=dummy_graph_ids(dataset)` to test out that behavior.\n", + "Now, we can add our (binary) labels to the dataset. In all examples we do this using `kloppy_polars_dataset.add_dummy_labels()`, but these are random labels and will not help with training.\n", "\n", - "Correctly splitting the final dataset in train, test and validiation sets using these Graph Identifiers is incorporated into `CustomSpektralDataset` (see [Section 6.1](#61-split-dataset) for more information)." + "To add useful labels for your task you need to \"join\" a Polars dataframe that contains a column with the required labels to the `kloppy_polars_dataset.data` Polars dataframe. Please note that in this dataframe each row is a single player (or ball) object, and thus each `frame_id` has 23 rows (if all players and ball are observed). All these rows (for a single frame_id) need to have _the same_ label. If your label column is not named `\"label\"` you need to pass the `label_col` (str) parameter to `SoccerGraphConverterPolars`.\n", + "\n", + "
\n", + "
\n",
+    "kloppy_polars_dataset.data = (\n",
+    "    kloppy_polars_dataset.data\n",
+    "    .join(\n",
+    "        some_label_dataframe.select([\"game_id\", \"period_id\", \"frame_id\", \"label\"]), \n",
+    "        on=[\"game_id\", \"period_id\", \"frame_id\"],\n",
+    "        how=\"left\"\n",
+    "    )\n)\n",
+    "
\n", + "
\n", + "\n", + "### SoccerGraphConverterPolars\n", + "\n", + "To get started with the `SoccerGraphConverterPolars` we need to pass one _required_ parameter:\n", + "- `dataset` (of type `KloppyPolarsDataset`) \n", + "\n", + "For a full list of other parameters we can pass to the `SoccerGraphConverterPolars`, check out [Graph FAQ Section B](graphs_faq.ipynb)" ] }, { @@ -174,9 +214,7 @@ "As mentioned in [Section 2](#2-public-skillcorner-data) we will use 4 matches of SkillCorner data. In the below example we will load the first 500 frames of data from each of these 4 games (we set `limit=500`) to create a dataset of 2,000 samples (Note: We're going to actually have less than 2,000 samples because setting `include_empty_frames=False` means we'll skip some frames in our conversion step).\n", "\n", "Important things to note:\n", - "- We import `dummy_labels` to randomly generate binary labels. Training with these random labels will not create a good model.\n", - "- We import `dummy_graph_ids` to generate fake graph labels.\n", - "- The `SoccerGraphConverter` handles all necessary steps (like setting the correct coordinate system, and left-right normalization).\n", + "- The `SoccerGraphConverterPolars` handles all necessary steps (like setting the correct coordinate system, and left-right normalization).\n", "- We will end up with fewer than 2,000 eventhough we set `limit=500` frames because we set `include_empty_frames=False` and all frames without ball coordinates are automatically ommited.\n", "- When using other providers always set `include_empty_frames=False` or `only_alive=True`.\n", "- We store the data as individual compressed pickle files, one file for per match. The data that gets stored in the pickle is a list of dictionaries, one dictionary per frame. Each dictionary has keys for the adjacency matrix, node features, edge features, label and graph id." 
@@ -184,25 +222,35 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "Processing frames: 100%|██████████| 500/500 [00:02<00:00, 244.81it/s]\n", - "Processing frames: 100%|██████████| 500/500 [00:01<00:00, 285.65it/s]\n", - "Processing frames: 100%|██████████| 500/500 [00:01<00:00, 343.58it/s] \n", - "Processing frames: 100%|██████████| 500/500 [00:01<00:00, 285.17it/s]\n" + "/Users/jbekkers/PycharmProjects/unravelsports/.venv311/lib/python3.11/site-packages/unravel/soccer/graphs/graph_converter_pl.py:187: UserWarning: Setting pad=True drops frames that do not have at least 1 object for the attacking team, defending team or ball.\n", + " This operation dropped 8 incomplete frames out of 488 total frames (1.64%)\n", + " \n", + " warnings.warn(\n", + "/Users/jbekkers/PycharmProjects/unravelsports/.venv311/lib/python3.11/site-packages/unravel/soccer/graphs/graph_converter_pl.py:187: UserWarning: Setting pad=True drops frames that do not have at least 1 object for the attacking team, defending team or ball.\n", + " This operation dropped 96 incomplete frames out of 487 total frames (19.71%)\n", + " \n", + " warnings.warn(\n", + "/Users/jbekkers/PycharmProjects/unravelsports/.venv311/lib/python3.11/site-packages/unravel/soccer/graphs/graph_converter_pl.py:187: UserWarning: Setting pad=True drops frames that do not have at least 1 object for the attacking team, defending team or ball.\n", + " This operation dropped 156 incomplete frames out of 494 total frames (31.58%)\n", + " \n", + " warnings.warn(\n", + "/Users/jbekkers/PycharmProjects/unravelsports/.venv311/lib/python3.11/site-packages/unravel/soccer/graphs/graph_converter_pl.py:187: UserWarning: Setting pad=True drops frames that do not have at least 1 object for the attacking team, defending team or ball.\n", + " This operation dropped 87 incomplete frames out of 500 total frames (17.40%)\n", + " \n", 
+ " warnings.warn(\n" ] } ], "source": [ "from os.path import exists\n", "\n", - "from unravel.utils import dummy_labels, dummy_graph_ids\n", - "\n", "match_ids = [4039, 3749, 3518, 3442]\n", "pickle_folder = \"pickles\"\n", "compressed_pickle_file_path = \"{pickle_folder}/{match_id}.pickle.gz\"\n", @@ -213,33 +261,33 @@ " )\n", " # if the output file already exists, skip this whole step\n", " if not exists(match_pickle_file_path):\n", - "\n", " # Load Kloppy dataset\n", - " dataset = skillcorner.load_open_data(\n", + " kloppy_dataset = skillcorner.load_open_data(\n", " match_id=match_id,\n", " coordinates=\"secondspectrum\",\n", " include_empty_frames=False,\n", - " limit=500, # limit to 500 frames in this example\n", + " limit=500, \n", + " )\n", + " dataset = KloppyPolarsDataset(\n", + " kloppy_dataset=kloppy_dataset,\n", + " ball_carrier_threshold=25.0\n", " )\n", + " dataset.load()\n", + " \n", + " dataset.add_graph_ids()\n", + " \n", + " dataset.add_dummy_labels()\n", "\n", " # Initialize the Graph Converter, with dataset, labels and settings\n", - " converter = SoccerGraphConverter(\n", + " converter = SoccerGraphConverterPolars(\n", " dataset=dataset,\n", - " # create fake labels\n", - " labels=dummy_labels(dataset),\n", - " graph_id=match_id,\n", - " # graph_ids=dummy_graph_ids(dataset),\n", " # Settings\n", - " ball_carrier_treshold=25.0,\n", " max_player_speed=12.0,\n", " max_ball_speed=28.0,\n", - " boundary_correction=None,\n", " self_loop_ball=True,\n", " adjacency_matrix_connect_type=\"ball\",\n", " adjacency_matrix_type=\"split_by_team\",\n", " label_type=\"binary\",\n", - " infer_ball_ownership=True,\n", - " infer_goalkeepers=True,\n", " defending_team_node_value=0.1,\n", " non_potential_receiver_node_value=0.1,\n", " random_seed=False,\n", @@ -254,7 +302,6 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "ℹī¸ For a full table of parameters we can pass to the `SoccerGraphConverter` check out [Graph FAQ Section B](graphs_faq.ipynb)\n", 
"\n", "-----" ] @@ -303,7 +350,7 @@ "Our `dataset` object has two custom methods to help split the data into train, test and validation sets.\n", "Either use `dataset.split_test_train()` if we don't need a validation set, or `dataset.split_test_train_validation()` if we do also require a validation set.\n", "\n", - "We can split our data 'by_graph_id' if we have provided Graph Ids in our `SoccerGraphConverter` using the 'graph_id' or 'graph_ids' parameter.\n", + "We can split our data 'by_graph_id' if we have provided Graph Ids in our `SoccerGraphConverterPolars` using the 'graph_id' or 'graph_ids' parameter.\n", "\n", "The 'split_train', 'split_test' and 'split_validation' parameters can either be ratios, percentages or relative size compared to total. \n", "\n", @@ -786,7 +833,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.11.11" } }, "nbformat": 4, diff --git a/examples/deprecated/1_kloppy_gnn_train.ipynb b/examples/deprecated/1_kloppy_gnn_train.ipynb new file mode 100644 index 0000000..b7a3089 --- /dev/null +++ b/examples/deprecated/1_kloppy_gnn_train.ipynb @@ -0,0 +1,794 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 🌀 unravel kloppy into graph neural network!\n", + "\n", + "First run `pip install unravelsports` if you haven't already!\n", + "\n", + "\n", + "-----\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install unravelsports --quiet" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this in-depth walkthrough we'll discuss everything the `unravelsports` package has to offer for converting a [Kloppy](https://github.com/PySport/kloppy) dataset of soccer tracking data into graphs for training binary classification graph neural networks using the [Spektral](https://graphneural.network/) library.\n", + "\n", + "This walkthrough will touch on 
a lot of the concepts from [A Graph Neural Network Deep-dive into Successful Counterattacks {A. Sahasrabudhe & J. Bekkers}](https://github.com/USSoccerFederation/ussf_ssac_23_soccer_gnn). It is strongly advised to first read the [research paper (pdf)](https://ussf-ssac-23-soccer-gnn.s3.us-east-2.amazonaws.com/public/Sahasrabudhe_Bekkers_SSAC23.pdf). Some concepts are also explained in the [Graphs FAQ](graphs_faq.md).\n", + "\n", + "Step by step we'll show how this package can be used to load soccer positional (tracking) data with `kloppy`, how to convert this data into \"graphs\", train a Graph Neural Network with `spektral`, evaluate its performance, save and load the model and finally apply the model to unseen data to make predictions.\n", + "\n", + "The powerful Kloppy package allows us to load and standardize data from many providers: Metrica, Sportec, Tracab, SecondSpectrum, StatsPerform and SkillCorner. In this guide we'll use some matches from the [Public SkillCorner Dataset](https://github.com/SkillCorner/opendata).\n", + "\n", + "
\n", + "Before we get started it is important to note that the unravelsports library does not have built in functionality to create binary labels, these will need to be supplied by the reader. In this example we use the dummy_labels() functionality that comes with the package. This function creates a single binary label for each frame by randomly assigning it a 0 or 1 value.\n", + "\n", + "
\n", + "\n", + "##### **Contents**\n", + "\n", + "- [**1. Imports**](#1-imports).\n", + "- [**2. Public SkillCorner Data**](#2-public-skillcorner-data).\n", + "- [**3. Graph Converter**](#3-graph-converter).\n", + "- [**4. Load Kloppy Data, Convert & Store**](#4-load-kloppy-data-convert-and-store).\n", + "- [**5. Creating a Custom Graph Dataset**](#5-creating-a-custom-graph-dataset).\n", + "- [**6. Prepare for Training**](#6-prepare-for-training).\n", + " - [6.1 Split Dataset](#61-split-dataset)\n", + " - [6.2 Model Configurations](#62-model-configurations)\n", + " - [6.3 Build GNN Model](#63-build-gnn-model)\n", + " - [6.4 Create DataLoaders](#64-create-dataloaders)\n", + "- [**7. GNN Training + Prediction**](#7-training-and-prediction).\n", + " - [7.1 Compile Model](#71-compile-model)\n", + " - [7.2 Fit Model](#72-fit-model)\n", + " - [7.3 Save & Load Model](#73-save--load-model)\n", + " - [7.4 Evaluate Model](#74-evaluate-model)\n", + " - [7.5 Predict on New Data](#75-predict-on-new-data)\n", + "\n", + "ℹ️ [**Graphs FAQ**](graphs_faq.md)\n", + "\n", + "-----" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. Imports\n", + "\n", + "We import `SoccerGraphConverter` to help us convert from Kloppy positional tracking frames to graphs.\n", + "\n", + "With the power of **Kloppy** we can also load data from many providers by importing `metrica`, `sportec`, `tracab`, `secondspectrum`, or `statsperform` from `kloppy`." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from unravel.soccer import SoccerGraphConverter\n", + "\n", + "from kloppy import skillcorner" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "-----" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. 
Public SkillCorner Data\n", + "\n", + "The `SoccerGraphConverter` class allows processing data from every tracking data provider supported by [PySports Kloppy](https://github.com/PySport/kloppy), namely:\n", + "- Sportec\n", + "- Tracab\n", + "- SecondSpectrum\n", + "- SkillCorner\n", + "- StatsPerform\n", + "- Metrica\n", + "\n", + "In this example we're going to use a sample of tracking data from 4 matches of [publicly available SkillCorner data](https://github.com/SkillCorner/opendata). \n", + "\n", + "All we need to know for now is that this data is from the following matches:\n", + "\n", + "| id | date_time | home_team | away_team |\n", + "|---:|:---------------------:|:-----------------------|:-----------------------|\n", + "| 4039 | 2020-07-02T19:15:00Z | Manchester City | Liverpool |\n", + "| 3749 | 2020-05-26T16:30:00Z | Dortmund | Bayern Munchen |\n", + "| 3518 | 2020-03-08T19:45:00Z | Juventus | Inter |\n", + "| 3442 | 2020-03-01T20:00:00Z | Real Madrid | FC Barcelona |" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "-----" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. 
Graph Converter\n", + "\n", + "ℹī¸ For more information on:\n", + "- What a Graph is, check out [Graph FAQ Section A](graphs_faq.ipynb)\n", + "- What parameters we can pass to the `SoccerGraphConverter`, check out [Graph FAQ Section B](graphs_faq.ipynb)\n", + "- What features each Graph has, check out [Graph FAQ Section C](graphs_faq.ipynb)\n", + "\n", + "---\n", + "\n", + "To get started with the `SoccerGraphConverter` we need to pass one _required_ parameter:\n", + "- `dataset` (of type `TrackingDataset` (Kloppy)) \n", + "\n", + "And one parameter that's required when we're converting for training purposes (more on this later):\n", + "- `labels` (a dictionary with `frame_id`s as keys and a value of `{True, False, 1 or 0}`).\n", + "```python\n", + "{83340: True, 83341: False, etc..} = {83340: 1, 83341: 0, etc..} = {83340: 1, 83341: False, etc..}\n", + "```\n", + "⚠ī¸ As mentioned before you will need to create your own labels! In this example we'll use `dummy_labels(dataset)` to generate a fake label for each frame.\n", + "\n", + "#### Graph Identifier(s):\n", + "When training a model on tracking data it's highly recommended to split data into test/train(/validation) sets by match or period such that all data end up in the same test, train or validation set. This should be done to avoid leaking information between test, train and validation sets. To make this simple, there are two _optional_ parameters we can pass to `SoccerGraphConverter`, namely:\n", + "- `graph_id`. This is a single identifier (str or int) for a whole match, for example the unique match id.\n", + "- `graph_ids`. This is a dictionary with the same keys as `labels`, but the values are now the unique identifiers. This option can be used if we want to split by sequence or possession_id. For example: {frame_id: 'matchId-sequenceId', frame_id: 'match_Id-sequenceId2'} etc. You will need to create your own ids. 
Note, if `labels` and `graph_ids` don't have the exact same keys it will throw an error.\n", + "\n", + "In this example we'll use the `graph_id=match_id` as the unique identifier, but feel free to change that for `graph_ids=dummy_graph_ids(dataset)` to test out that behavior.\n", + "\n", + "Correctly splitting the final dataset in train, test and validation sets using these Graph Identifiers is incorporated into `CustomSpektralDataset` (see [Section 6.1](#61-split-dataset) for more information)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "------" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "### 4. Load Kloppy Data, Convert and Store\n", + "\n", + "As mentioned in [Section 2](#2-public-skillcorner-data) we will use 4 matches of SkillCorner data. In the below example we will load the first 500 frames of data from each of these 4 games (we set `limit=500`) to create a dataset of 2,000 samples (Note: We're going to actually have less than 2,000 samples because setting `include_empty_frames=False` means we'll skip some frames in our conversion step).\n", + "\n", + "Important things to note:\n", + "- We import `dummy_labels` to randomly generate binary labels. Training with these random labels will not create a good model.\n", + "- We import `dummy_graph_ids` to generate fake graph labels.\n", + "- The `SoccerGraphConverter` handles all necessary steps (like setting the correct coordinate system, and left-right normalization).\n", + "- We will end up with fewer than 2,000 even though we set `limit=500` frames because we set `include_empty_frames=False` and all frames without ball coordinates are automatically omitted.\n", + "- When using other providers always set `include_empty_frames=False` or `only_alive=True`.\n", + "- We store the data as individual compressed pickle files, one file per match. The data that gets stored in the pickle is a list of dictionaries, one dictionary per frame. 
Each dictionary has keys for the adjacency matrix, node features, edge features, label and graph id." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Processing frames: 100%|██████████| 500/500 [00:02<00:00, 244.81it/s]\n", + "Processing frames: 100%|██████████| 500/500 [00:01<00:00, 285.65it/s]\n", + "Processing frames: 100%|██████████| 500/500 [00:01<00:00, 343.58it/s] \n", + "Processing frames: 100%|██████████| 500/500 [00:01<00:00, 285.17it/s]\n" + ] + } + ], + "source": [ + "from os.path import exists\n", + "\n", + "from unravel.utils import dummy_labels, dummy_graph_ids\n", + "\n", + "match_ids = [4039, 3749, 3518, 3442]\n", + "pickle_folder = \"pickles\"\n", + "compressed_pickle_file_path = \"{pickle_folder}/{match_id}.pickle.gz\"\n", + "\n", + "for match_id in match_ids:\n", + " match_pickle_file_path = compressed_pickle_file_path.format(\n", + " pickle_folder=pickle_folder, match_id=match_id\n", + " )\n", + " # if the output file already exists, skip this whole step\n", + " if not exists(match_pickle_file_path):\n", + "\n", + " # Load Kloppy dataset\n", + " dataset = skillcorner.load_open_data(\n", + " match_id=match_id,\n", + " coordinates=\"secondspectrum\",\n", + " include_empty_frames=False,\n", + " limit=500, # limit to 500 frames in this example\n", + " )\n", + "\n", + " # Initialize the Graph Converter, with dataset, labels and settings\n", + " converter = SoccerGraphConverter(\n", + " dataset=dataset,\n", + " # create fake labels\n", + " labels=dummy_labels(dataset),\n", + " graph_id=match_id,\n", + " # graph_ids=dummy_graph_ids(dataset),\n", + " # Settings\n", + " ball_carrier_treshold=25.0,\n", + " max_player_speed=12.0,\n", + " max_ball_speed=28.0,\n", + " boundary_correction=None,\n", + " self_loop_ball=True,\n", + " adjacency_matrix_connect_type=\"ball\",\n", + " adjacency_matrix_type=\"split_by_team\",\n", + " 
label_type=\"binary\",\n", + " infer_ball_ownership=True,\n", + " infer_goalkeepers=True,\n", + " defending_team_node_value=0.1,\n", + " non_potential_receiver_node_value=0.1,\n", + " random_seed=False,\n", + " pad=True,\n", + " verbose=False,\n", + " )\n", + " # Compute the graphs and directly store them as a pickle file\n", + " converter.to_pickle(file_path=match_pickle_file_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "ℹī¸ For a full table of parameters we can pass to the `SoccerGraphConverter` check out [Graph FAQ Section B](graphs_faq.ipynb)\n", + "\n", + "-----" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 5. Creating a Custom Graph Dataset\n", + "\n", + "To easily train our model with the Spektral library we need to use a Spektral dataset object. The `CustomSpektralDataset` class helps us create such an object really easily.\n", + "\n", + "- `CustomSpektralDataset` is a [`spektral.data.Dataset`](https://graphneural.network/creating-dataset/). \n", + "This type of dataset makes it very easy to properly load, train and predict with a Spektral GNN.\n", + "- The `CustomSpektralDataset` has an option to load from a folder of compressed pickle files, all we have to do is pass the pickle_folder location.\n", + "\n", + "ℹī¸ For more information on the `CustomSpektralDataset` please check the [Graphs FAQ Section D](graphs_faq.ipynb)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "from unravel.utils import CustomSpektralDataset\n", + "\n", + "dataset = CustomSpektralDataset(pickle_folder=pickle_folder)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 6. 
Prepare for Training\n", + "\n", + "Now that we have all the data converted into Graphs inside our `CustomSpektralDataset` object, we can prepare to train the GNN model.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 6.1 Split Dataset\n", + "\n", + "Our `dataset` object has two custom methods to help split the data into train, test and validation sets.\n", + "Either use `dataset.split_test_train()` if we don't need a validation set, or `dataset.split_test_train_validation()` if we do also require a validation set.\n", + "\n", + "We can split our data 'by_graph_id' if we have provided Graph Ids in our `SoccerGraphConverter` using the 'graph_id' or 'graph_ids' parameter.\n", + "\n", + "The 'split_train', 'split_test' and 'split_validation' parameters can either be ratios, percentages or relative size compared to total. \n", + "\n", + "We opt to create a test, train _and_ validation set to use in our example." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train: CustomSpektralDataset(n_graphs=791)\n", + "Test: CustomSpektralDataset(n_graphs=477)\n", + "Validation: CustomSpektralDataset(n_graphs=336)\n" + ] + } + ], + "source": [ + "train, test, val = dataset.split_test_train_validation(\n", + " split_train=4, split_test=1, split_validation=1, by_graph_id=True, random_seed=42\n", + ")\n", + "print(\"Train:\", train)\n", + "print(\"Test:\", test)\n", + "print(\"Validation:\", val)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "🗒ī¸ We can see that, because we are splitting by only 4 different graph_ids here (the 4 match_ids) the ratio's aren't perfectly 4 to 1 to 1. 
If you change the `graph_id=match_id` parameter in the `SoccerGraphConverter` to `graph_ids=dummy_graph_ids(dataset)` you'll see that it's easier to get close to the correct ratios, simply because we have a lot more graph_ids to split across. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 6.2 Model Configurations" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "learning_rate = 1e-3\n", + "epochs = 5 # Increase for actual training\n", + "batch_size = 32\n", + "channels = 128\n", + "n_layers = 3 # Number of CrystalConv layers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 6.3 Build GNN Model\n", + "\n", + "This GNN Model has the same architecture as described in [A Graph Neural Network Deep-dive into Successful Counterattacks {A. Sahasrabudhe & J. Bekkers}](https://github.com/USSoccerFederation/ussf_ssac_23_soccer_gnn/tree/main)\n", + "\n", + "This exact model can also simply be loaded as:\n", + "\n", + "`from unravel.classifiers import CrystalGraphClassifier` as shown in [Quick Start Guide](0_quick_start_guide.ipynb)\n", + "\n", + "Below we show the exact same code to make it easier to adjust." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "from spektral.layers import GlobalAvgPool, CrystalConv\n", + "from tensorflow.keras.layers import Dense, Dropout\n", + "from tensorflow.keras.models import Model\n", + "\n", + "\n", + "class CrystalGraphClassifier(Model):\n", + " def __init__(\n", + " self,\n", + " n_layers: int = 3,\n", + " channels: int = 128,\n", + " drop_out: float = 0.5,\n", + " n_out: int = 1,\n", + " **kwargs\n", + " ):\n", + " super().__init__(**kwargs)\n", + "\n", + " self.n_layers = n_layers\n", + " self.channels = channels\n", + " self.drop_out = drop_out\n", + " self.n_out = n_out\n", + "\n", + " self.conv1 = CrystalConv()\n", + " self.convs = [CrystalConv() for _ in range(1, self.n_layers)]\n", + " self.pool = GlobalAvgPool()\n", + " self.dense1 = Dense(self.channels, activation=\"relu\")\n", + " self.dropout = Dropout(self.drop_out)\n", + " self.dense2 = Dense(self.channels, activation=\"relu\")\n", + " self.dense3 = Dense(self.n_out, activation=\"sigmoid\")\n", + "\n", + " def call(self, inputs):\n", + " x, a, e, i = inputs\n", + " x = self.conv1([x, a, e])\n", + " for conv in self.convs:\n", + " x = conv([x, a, e])\n", + " x = self.pool([x, i])\n", + " x = self.dense1(x)\n", + " x = self.dropout(x)\n", + " x = self.dense2(x)\n", + " x = self.dropout(x)\n", + " return self.dense3(x)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 6.4 Create DataLoaders\n", + "\n", + "Create a Spektral [`DisjointLoader`](https://graphneural.network/loaders/#disjointloader). This DisjointLoader will help us to load batches of Disjoint Graphs for training purposes.\n", + "\n", + "Note that these Spektral `Loaders` return a generator, so if we want to retrain the model, we need to reload these loaders." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "from spektral.data import DisjointLoader\n", + "\n", + "loader_tr = DisjointLoader(train, batch_size=batch_size, epochs=epochs)\n", + "loader_va = DisjointLoader(val, epochs=1, shuffle=False, batch_size=batch_size)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "--------" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 7. Training and Prediction\n", + "\n", + "Below we outline how to train the model, make predictions and add the predicted values back to the Kloppy dataframe." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 7.1 Compile Model\n", + "\n", + "1. Initialize the `CrystalGraphClassifier` (or create your own Graph Classifier).\n", + "2. Compile the model with a loss function, optimizer and your preferred metrics." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from tensorflow.keras.metrics import AUC, BinaryAccuracy\n", + "from tensorflow.keras.losses import BinaryCrossentropy\n", + "from tensorflow.keras.optimizers import Adam\n", + "from tensorflow.keras.callbacks import EarlyStopping\n", + "\n", + "model = CrystalGraphClassifier()\n", + "\n", + "model.compile(\n", + " loss=BinaryCrossentropy(), optimizer=Adam(), metrics=[AUC(), BinaryAccuracy()]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 7.2 Fit Model\n", + "\n", + "1. We have a [`DisjointLoader`](https://graphneural.network/loaders/#disjointloader) for training and validation sets.\n", + "2. Fit the model. \n", + "3. 
We add `EarlyStopping` and a `validation_data` dataset to monitor performance, and set `use_multiprocessing=True` to improve training speed.\n", + "\n", + "⚠ī¸ When trying to fit the model _again_ make sure to reload Data Loaders in [Section 6.4](#64-create-dataloaders), because they are generators." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model.fit(\n", + " loader_tr.load(),\n", + " steps_per_epoch=loader_tr.steps_per_epoch,\n", + " epochs=5,\n", + " use_multiprocessing=True,\n", + " validation_data=loader_va.load(),\n", + " callbacks=[EarlyStopping(monitor=\"loss\", patience=5, restore_best_weights=True)],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 7.3 Save & Load Model\n", + "\n", + "This step is solely included to show how to restore a model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from tensorflow.keras.models import load_model\n", + "\n", + "model_path = \"models/my-first-graph-classifier\"\n", + "model.save(model_path)\n", + "loaded_model = load_model(model_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 7.4 Evaluate Model\n", + "\n", + "1. Create another `DisjointLoader`, this time for the test set.\n", + "2. Evaluate model performance on the test set. This evaluation function uses the `metrics` passed to `model.compile`\n", + "\n", + "🗒ī¸ Our performance is really bad because we're using random labels, very few epochs and a small dataset.\n", + "\n", + "📖 For more information on evaluation in sports analytics see: [Methodology and evaluation in sports analytics: challenges, approaches, and lessons learned {J. Davis et. al. 
(2024)}](https://link.springer.com/article/10.1007/s10994-024-06585-0)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "15/15 [==============================] - 0s 4ms/step - loss: 0.7250 - auc: 0.5309 - binary_accuracy: 0.5241\n" + ] + } + ], + "source": [ + "loader_te = DisjointLoader(test, epochs=1, shuffle=False, batch_size=batch_size)\n", + "results = model.evaluate(loader_te.load())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 7.5 Predict on New Data\n", + "\n", + "1. Load new, unseen data from the SkillCorner dataset.\n", + "2. Convert this data, making sure we use the exact same settings as in step 1.\n", + "3. If we set `prediction=True` we do not have to supply labels to the `SoccerGraphConverter`." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "kloppy_dataset = skillcorner.load_open_data(\n", + " match_id=2068, # A game we have not yet used in section 4\n", + " include_empty_frames=False,\n", + " limit=500,\n", + ")\n", + "\n", + "preds_converter = SoccerGraphConverter(\n", + " dataset=kloppy_dataset,\n", + " prediction=True,\n", + " ball_carrier_treshold=25.0,\n", + " max_player_speed=12.0,\n", + " max_ball_speed=28.0,\n", + " boundary_correction=None,\n", + " self_loop_ball=True,\n", + " adjacency_matrix_connect_type=\"ball\",\n", + " adjacency_matrix_type=\"split_by_team\",\n", + " label_type=\"binary\",\n", + " infer_ball_ownership=True,\n", + " infer_goalkeepers=True,\n", + " defending_team_node_value=0.1,\n", + " non_potential_receiver_node_value=0.1,\n", + " random_seed=False,\n", + " pad=True,\n", + " verbose=False,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "4. 
Make a prediction on all the frames of this dataset using `model.predict`" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Processing frames: 100%|██████████| 500/500 [00:01<00:00, 326.02it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "11/11 [==============================] - 0s 4ms/step\n" + ] + } + ], + "source": [ + "# Compute the graphs and add them to the CustomSpektralDataset\n", + "pred_dataset = CustomSpektralDataset(graphs=preds_converter.to_spektral_graphs())\n", + "\n", + "loader_pred = DisjointLoader(\n", + " pred_dataset, batch_size=batch_size, epochs=1, shuffle=False\n", + ")\n", + "preds = model.predict(loader_pred.load(), use_multiprocessing=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "5. Convert Kloppy dataset to a dataframe and merge back the predictions using the frame_ids." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
frame_idperiod_idtimestampy
300216610 days 00:00:33.3000000.259016
301216710 days 00:00:33.4000000.251124
302216810 days 00:00:33.5000000.258305
303216910 days 00:00:33.6000000.256378
304217010 days 00:00:33.7000000.305434
\n", + "
" + ], + "text/plain": [ + " frame_id period_id timestamp y\n", + "300 2166 1 0 days 00:00:33.300000 0.259016\n", + "301 2167 1 0 days 00:00:33.400000 0.251124\n", + "302 2168 1 0 days 00:00:33.500000 0.258305\n", + "303 2169 1 0 days 00:00:33.600000 0.256378\n", + "304 2170 1 0 days 00:00:33.700000 0.305434" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "kloppy_df = kloppy_dataset.to_df()\n", + "\n", + "preds_df = pd.DataFrame(\n", + " {\"frame_id\": [x.id for x in pred_dataset], \"y\": preds.flatten()}\n", + ")\n", + "\n", + "kloppy_df = pd.merge(kloppy_df, preds_df, on=\"frame_id\", how=\"left\")\n", + "\n", + "kloppy_df[300:305][[\"frame_id\", \"period_id\", \"timestamp\", \"y\"]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "🗒ī¸ Not all frames have a prediction because of missing (ball) data, so we look at the 300th-305th frame." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv311", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.11" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/graphs_faq.md b/examples/graphs_faq.md index 8ea6495..fa50b53 100644 --- a/examples/graphs_faq.md +++ b/examples/graphs_faq.md @@ -51,10 +51,6 @@ In section 6.1 we can see what this looks like in Python. | `max_ball_acceleration` | float | The maximum speed of the ball in yards per second squared. Used for normalizing node features. | 10.0 | 🏈 | | `attacking_non_qb_node_value` | float | Value for the node feature when player is NOT the QB, but is on the attacking team | 0.1 | 🏈 | | `chunk_size` | int | Set to determine size of conversions from Polars to Graphs. 
Preferred setting depends on available computing power | 2_000 | 🏈 | -| `ball_carrier_threshold` | float | The distance threshold to determine the ball carrier in meters. If no ball carrier within ball_carrier_threshold, we skip the frame. | 25.0 | ⚽ | -| `boundary_correction` | float | A correction factor for boundary calculations, used to correct out of bounds as a percentage (Used as 1+boundary_correction, i.e., 0.05). Not setting this might lead to players outside the pitch markings to have values that fall slightly outside of our normalization range. When we set boundary_correction, any players outside the pitch will be moved to be on the closest line. | None | ⚽ | -| `infer_ball_ownership` | bool | Infers 'attacking_team' if no 'ball_owning_team' exist (in Kloppy TrackingDataset) by finding the player closest to the ball using ball xyz, uses 'ball_carrier_threshold' as a cut-off. | True | ⚽ | -| `infer_goalkeepers` | bool | Set True if no GK label is provided, set False for incomplete (broadcast tracking) data that might not have a GK in every frame. | True | ⚽ | | `non_potential_receiver_node_value` | float | Value for the node feature when player is NOT a potential receiver of a pass (when on opposing team or in possession of the ball). Should be between 0 and 1 including. | 0.1 | ⚽ | @@ -64,7 +60,7 @@ In section 6.1 we can see what this looks like in Python. #### C. What features does each Graph have?
- 🌀 ⚽ Expand for a full list of Soccer features + 🌀 ⚽ Expand for a full list of Soccer features (note: `SoccerGraphConverter` and `SoccerGraphConverterPolars` have slightly different features) | Variable | Datatype | Index | Features | |----------|-----------------------------------|-------|---------------------------------------------------------------------------------------------------------------------------------| diff --git a/unravel/soccer/graphs/graph_converter_pl.py b/unravel/soccer/graphs/graph_converter_pl.py index 5beeb82..703c57d 100644 --- a/unravel/soccer/graphs/graph_converter_pl.py +++ b/unravel/soccer/graphs/graph_converter_pl.py @@ -44,6 +44,9 @@ class SoccerGraphConverterPolars(DefaultGraphConverter): non_potential_receiver_node_value: float = 0.1 def __post_init__(self): + if not isinstance(self.dataset, KloppyPolarsDataset): + raise ValueError("dataset should be of type KloppyPolarsDataset...") + self.pitch_dimensions: MetricPitchDimensions = self.dataset.pitch_dimensions self.label_column: str = ( self.label_col if self.label_col is not None else self.dataset._label_column @@ -298,14 +301,14 @@ def __compute(self, args: List[pl.Series]) -> dict: d = {col: args[i].to_numpy() for i, col in enumerate(self.__exprs_variables)} if not np.all(d[self.graph_id_column] == d[self.graph_id_column][0]): - raise Exception( - "GraphId selection contains multiple different values. Make sure each graph_id is unique by at least game_id and frame_id..." + raise ValueError( + "graph_id selection contains multiple different values. Make sure each graph_id is unique by at least game_id and frame_id..." ) if not self.prediction and not np.all( d[self.label_column] == d[self.label_column][0] ): - raise Exception( + raise ValueError( """Label selection contains multiple different values for a single selection (group by) of game_id and frame_id, make sure this is not the case. 
Each group can only have 1 label.""" ) From 77ab8c2c0f1ff1e1eed531c5a1d5798708f8d6d9 Mon Sep 17 00:00:00 2001 From: "UnravelSports [JB]" Date: Mon, 27 Jan 2025 10:02:48 +0100 Subject: [PATCH 10/10] black jupyter --- examples/1_kloppy_gnn_train.ipynb | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/examples/1_kloppy_gnn_train.ipynb b/examples/1_kloppy_gnn_train.ipynb index 39aee88..08fb650 100644 --- a/examples/1_kloppy_gnn_train.ipynb +++ b/examples/1_kloppy_gnn_train.ipynb @@ -266,16 +266,15 @@ " match_id=match_id,\n", " coordinates=\"secondspectrum\",\n", " include_empty_frames=False,\n", - " limit=500, \n", + " limit=500,\n", " )\n", " dataset = KloppyPolarsDataset(\n", - " kloppy_dataset=kloppy_dataset,\n", - " ball_carrier_threshold=25.0\n", + " kloppy_dataset=kloppy_dataset, ball_carrier_threshold=25.0\n", " )\n", " dataset.load()\n", - " \n", + "\n", " dataset.add_graph_ids()\n", - " \n", + "\n", " dataset.add_dummy_labels()\n", "\n", " # Initialize the Graph Converter, with dataset, labels and settings\n",