import numpy as np
import pandas as pd
import warnings
from scipy.spatial import distance_matrix
from scipy.spatial.distance import pdist, squareform
import tsfresh
from shapely.geometry import Polygon
import matplotlib.pyplot as plt
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean
from geoalchemy2 import functions, elements
from .utils import presence_3d, angle
from tqdm import tqdm
import math
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import directed_hausdorff
import multiprocessing
from functools import partial
import re
from pandas.api.types import is_numeric_dtype
[docs]def grouping_data(processed_data, pick_vars=None, preprocessedMethod=False):
"""
Function to group data records by 'animal_id'. Adds additional attributes/columns, if features aren't extracted yet.
:param processed_data: pd.DataFrame with all preprocessed records.
:param preprocessedMethod: Boolean whether calling method is from preprocessing to check whether columns for features are added.
:return: dictionary with 'animal_id' as key and all records as value.
"""
# A dictionary object to hold all groups obtained using group by-
# Apply grouping using 'animal_id' attribute-
data_animal_id = processed_data.groupby('animal_id')
# A dictionary object to hold all groups obtained using group by-
data_animal_id_groups = {}
# Get each animal_id's data from grouping performed-
# for animal_id in data_animal_id.groups.keys():
# data_animal_id_groups[animal_id] = data_animal_id.get_group(animal_id)
for animal_id, df in data_animal_id:
data_animal_id_groups[animal_id] = df
# To reset index for each group-
for animal_id in data_animal_id_groups.keys():
data_animal_id_groups[animal_id].reset_index(drop=True, inplace=True)
if list(processed_data.columns.values) == list(['time', 'animal_id', 'x', 'y']) and not preprocessedMethod:
# Add additional attributes/columns to each groups-
for aid in data_animal_id_groups.keys():
data = [None for x in range(data_animal_id_groups[aid].shape[0])]
data_animal_id_groups[aid] = data_animal_id_groups[aid].assign(
distance=data)
data_animal_id_groups[aid] = data_animal_id_groups[aid].assign(
average_speed=data)
data_animal_id_groups[aid] = data_animal_id_groups[aid].assign(
average_acceleration=data)
# data_animal_id_groups[aid] = data_animal_id_groups[aid].assign(
# positive_acceleration=data)
data_animal_id_groups[aid] = data_animal_id_groups[aid].assign(
direction=data)
data_animal_id_groups[aid] = data_animal_id_groups[aid].assign(
stopped=data)
if pick_vars != None:
for aid in data_animal_id_groups.keys():
data_animal_id_groups[aid] = data_animal_id_groups[
aid].loc[:, pick_vars]
return data_animal_id_groups
[docs]def timewise_dict(data):
"""
Group records by timestep.
:param data: pd.DataFrame with all preprocessed records.
:return: dictionary with 'time' as key and all animal records as value.
"""
# Input grouped into time
data_time = data.groupby('time')
# Pour into dictionary to hold data by time
dict_time = {}
for time in data_time.groups.keys():
dict_time[time] = data_time.get_group(time)
dict_time[time].reset_index(drop=True, inplace=True)
return dict_time
[docs]def regrouping_data(data_animal_id_groups):
"""
Concatenate all Pandas DataFrames in grouped dictionary into one.
:param data_animal_id_groups: dictionary ordered by 'animal_id'.
:return: Pandas DataFrame containing all records of all andimal_ids.
"""
# Concatenate all single records into one
result = pd.concat(data_animal_id_groups[aid]
for aid in data_animal_id_groups.keys())
result.sort_values(['time', 'animal_id'], ascending=True, inplace=True)
# Reset index-
result.reset_index(drop=True, inplace=True)
return result
[docs]def compute_direction_angle(data, param_x='x', param_y='y', colname='direction_angle'):
"""
Computes the angle of rotation of an animal between two timesteps. Only possible if coordinates are 2D only.
:param data: dataframe containing the movement records
:param param_x: column name of the x coordinate
:param param_y: column name of the y coordinate
:param colname: the name to appear in the new DataFrame for the direction angle computed.
:return: dataframe containing computed 'direction_angle' as angle from 0-360 degrees (x-axis to the right is 0 degrees)
"""
data_animal_id_groups = grouping_data(data)
# take the first dataframe to check the 3d presence
if presence_3d(data_animal_id_groups[next(iter(data_animal_id_groups))]):
is_3d = True
warnings.warn(
'The direction angle can only be calculated for two-dimensional coordinate data. The dataframe given to the function has more than two dimensions.')
else:
# iterate over movers
for aid in data_animal_id_groups.keys():
data_animal_id_groups[aid]['y_change'] = data_animal_id_groups[aid][param_y] - data_animal_id_groups[aid][
param_y].shift(periods=1) # change of y and x coordinate to create direction vector
data_animal_id_groups[aid]['x_change'] = data_animal_id_groups[aid][param_x] - data_animal_id_groups[aid][
param_x].shift(periods=1)
data_animal_id_groups[aid][colname] = data_animal_id_groups[aid]['y_change'] / data_animal_id_groups[aid][
'x_change'] # formula: tan^(-1) (y_change / x_change) = angle of direction change
data_animal_id_groups[aid][colname] = data_animal_id_groups[aid][colname].apply(
lambda x: math.degrees(math.atan(x))) # convert angle to degrees
data_animal_id_groups[aid].loc[(data_animal_id_groups[aid]['y_change'] >= 0) & (
data_animal_id_groups[aid]['x_change'] < 0), colname] = 180 + data_animal_id_groups[aid][
colname] # adjust to correct angle
data_animal_id_groups[aid].loc[(data_animal_id_groups[aid]['y_change'] < 0) & (
data_animal_id_groups[aid]['x_change'] >= 0), colname] = 360 + data_animal_id_groups[aid][
colname]
data_animal_id_groups[aid].loc[(data_animal_id_groups[aid]['y_change'] < 0) & (
data_animal_id_groups[aid]['x_change'] < 0), colname] = 180 + data_animal_id_groups[aid][
colname]
data_animal_id_groups[aid].loc[0, [colname]] = 0 # direction for first timestamp is 0
data_animal_id_groups[aid].drop(['y_change', 'x_change'], inplace=True, axis=1)
data = regrouping_data(data_animal_id_groups)
return data
[docs]def compute_turning_angle(data, colname='turning_angle', direction_angle_name='direction_angle'):
"""
Computes the turning angle for a mover between two timesteps as the difference of its direction angle. Only possible for 2D data.
:param data: dataframe containing the movement records.
:param colname: the name of the new column to be added.
:param direction_angle_name: the name of the column containg the direction angle for each movement record.
:return: dataframe containing an additional column with the difference in degrees between current and previous timestamp for each record.
Note that difference can not be higher than +-180 degrees.
"""
# when differences exceed |180| convert values so that they stay in the domain -180 +180
boundary = lambda data: 360 - data if data > 180 else -360 - data if data < -180 else data
vboundary = np.vectorize(boundary)
# check dataframe if data contains direction_angle column
if direction_angle_name not in data.columns:
warnings.warn('As it is needed to calculate the turning angle at first the direction angle has to be computed.')
data = compute_direction_angle(data)
data_animal_id_groups = grouping_data(data)
# iterate over movers
for aid in data_animal_id_groups.keys():
turning_angles = data_animal_id_groups[aid][direction_angle_name] - data_animal_id_groups[aid][
direction_angle_name].shift(
periods=1) # calculate difference in direction angle between current and previous timestamp
turning_angles = vboundary(turning_angles)
data_animal_id_groups[aid] = data_animal_id_groups[aid].assign(
inp=turning_angles)
data_animal_id_groups[aid] = data_animal_id_groups[aid].rename(
columns={'inp': colname})
data = regrouping_data(data_animal_id_groups)
return data
[docs]def compute_direction(data_animal_id_groups, pbar, param_x='x', param_y='y', param_z='z', colname='direction'):
"""
Computes the movement vector for each timestamp by checking the difference of the coordinates to the previous timestamp.
:param data_animal_id_groups: dictionary containing the data frames for each animal
:param pbar: percentage bar filled with 10% already
:param param_x: column name of the x coordinate
:param param_y: column name of the y coordinate
:param param_z: column name of the z coordinate
:param colname: the name to appear in the new DataFrame for the calculated direction
:return: dictionary containing computed 'direction' attribute
"""
percent_update = 90 / len(data_animal_id_groups.keys()) # how much the pb is updated after each animal
# take the first dataframe to check the 3d presence
if presence_3d(data_animal_id_groups[next(iter(data_animal_id_groups))]):
is_3d = True
else:
is_3d = False
# iterate over movers
for aid in data_animal_id_groups.keys():
if is_3d:
data_animal_id_groups[aid]['x_change'] = round(
data_animal_id_groups[aid][param_x] - data_animal_id_groups[aid][param_x].shift(periods=1), 4)
data_animal_id_groups[aid]['y_change'] = round(
data_animal_id_groups[aid][param_y] - data_animal_id_groups[aid][param_y].shift(periods=1), 4)
data_animal_id_groups[aid]['z_change'] = round(
data_animal_id_groups[aid][param_z] - data_animal_id_groups[aid][param_z].shift(periods=1), 4)
data_animal_id_groups[aid].loc[0, 'x_change'], data_animal_id_groups[aid].loc[0, 'y_change'], \
data_animal_id_groups[aid].loc[0, 'z_change'] = 0, 0, 0
data_animal_id_groups[aid][colname] = data_animal_id_groups[aid].apply(
lambda row: (row.x_change, row.y_change, row.z_change), axis=1)
data_animal_id_groups[aid].drop(['x_change', 'y_change', 'z_change'], inplace=True, axis=1)
else:
data_animal_id_groups[aid]['x_change'] = round(
data_animal_id_groups[aid][param_x] - data_animal_id_groups[aid][param_x].shift(periods=1), 4)
data_animal_id_groups[aid]['y_change'] = round(
data_animal_id_groups[aid][param_y] - data_animal_id_groups[aid][param_y].shift(periods=1), 4)
data_animal_id_groups[aid].loc[0, 'x_change'], data_animal_id_groups[aid].loc[0, 'y_change'] = 0, 0
data_animal_id_groups[aid][colname] = data_animal_id_groups[aid].apply(
lambda row: (row.x_change, row.y_change), axis=1)
data_animal_id_groups[aid].drop(['x_change', 'y_change'], inplace=True, axis=1)
pbar.update(percent_update)
return data_animal_id_groups
[docs]def compute_turning(data_animal_id_groups, param_direction="direction", colname="turning"):
"""
Computes the turning for a mover between two timesteps as the cosine similarity between its direction vectors.
:param data_animal_id_groups: dictionary ordered by 'animal_id'.
:param param_direction: Column name to be recognized as direction. Default "direction".
:param colname: the name of the new column added which contains the computed cosine similarity.
:return: data_animal_id_groups
"""
for aid in data_animal_id_groups.keys():
cos_similarities = [cosine_similarity(np.array([data_animal_id_groups[aid][param_direction][i]]),
np.array([data_animal_id_groups[aid][param_direction][i + 1]]))[0][0] for
i in range(0, len(data_animal_id_groups[aid][
param_direction]) - 1)] # cosine similarity for direction vectors of two following timestamps
cos_similarities.insert(0,
0) # first entry has no previous entry -> cosine similarity can not be calculated and value is set to 0
data_animal_id_groups[aid][colname] = cos_similarities
return data_animal_id_groups
[docs]def compute_distance(data_animal_id_groups, param_x="x", param_y="y", param_z="z"):
"""
Calculate metric distance of animals in between two timesteps.
:param data_animal_id_groups: dictionary ordered by 'animal_id'.
:param param_x: Column name to be recognized as x. Default "x".
:param param_y: Column name to be recognized as y. Default "y".
:param param_z: Column name to be recognized as z. Default "z".
:return: dictionary containing computed 'distance' attribute.
"""
# take the first dataframe to check the 3d presence
if presence_3d(data_animal_id_groups[next(iter(data_animal_id_groups))]):
for aid in data_animal_id_groups.keys():
p1 = data_animal_id_groups[aid].loc[:, [param_x, param_y, param_z]]
p2 = data_animal_id_groups[aid].loc[:, [param_x, param_y, param_z]].shift(
periods=1)
p2.iloc[0, :] = [0.0, 0.0, 0.0]
data_animal_id_groups[aid]['distance'] = ((p1 -
p2) ** 2).sum(axis=1) ** 0.5
# 2D dataset
else:
for aid in data_animal_id_groups.keys():
p1 = data_animal_id_groups[aid].loc[:, [param_x, param_y]]
p2 = data_animal_id_groups[aid].loc[:, [param_x, param_y]].shift(
periods=1)
p2.iloc[0, :] = [0.0, 0.0]
data_animal_id_groups[aid]['distance'] = ((p1 -
p2) ** 2).sum(axis=1) ** 0.5
# Reset first entry for each 'animal_id' to zero-
for aid in data_animal_id_groups.keys():
data_animal_id_groups[aid].loc[0, 'distance'] = 0.0
return data_animal_id_groups
[docs]def compute_average_speed(data_animal_id_groups, fps):
"""
Compute average speed of mover based on fps parameter. The formula used for calculating average speed is: (Total Distance traveled) / (Total time taken).
Size of traveling window is determined by fps parameter:
By choosing f.e. fps=4 at timestamp 5: (distance covered from timestamp 3 to timestamp 7) / 4.
By choosing f.e. fps=3 at timestamp 5: (distance covered from timestamp 3.5 to timestamp 6.5) / 3. (in this case use of interpolation if timestamps 3.5 and 6.5 do not exist.)
:param data_animal_id_groups: dictionary with 'animal_id' as keys.
:param fps: integer to define size of window for integer-formatted time or string to define size of window for datetime-formatted time (For possible units refer to:https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases.)
:return: dictionary, including measure for 'average_speed'.
"""
if (isinstance(fps, int) or isinstance(fps, float)):
fps = str(fps) + 'us' # as integer time is converted to time for flexible windows so is fps
for aid in data_animal_id_groups.keys():
# set index as time to have flexible sized windows
if is_numeric_dtype(data_animal_id_groups[aid]['time']):
data_animal_id_groups[aid].index = pd.to_datetime(data_animal_id_groups[aid]['time'], unit='us')
else:
data_animal_id_groups[aid].set_index('time', drop=False, inplace=True)
# add distances of window (will later be divided by time)
data_animal_id_groups[aid]['average_speed'] = data_animal_id_groups[
aid]['distance'].rolling(min_periods=1, window=fps,
center=True, closed='both').sum().fillna(0)
timedelta = pd.to_timedelta(fps) # size time window
timedelta_left = timedelta / 2 # size window left of observation
timedelta_right = timedelta / 2 # size window right of observation
# adjust first term of sum (as not entire distance of first entry in window was moved in time window)
ind = data_animal_id_groups[aid].index.searchsorted(data_animal_id_groups[aid].index - timedelta_left, side='left') # find index of first element in window
out_of_range = (ind <= 0)
ind[out_of_range] = 1 # otherwise key error
lag = data_animal_id_groups[aid]['distance'].values[ind]
lag[out_of_range] = 0 # set to 0 such that no distance is reduced from sum
data_animal_id_groups[aid]['lag'] = lag # each observation has stored distance of its windows first entry (which is then partially reduced)
data_animal_id_groups[aid]['average_speed'] = data_animal_id_groups[aid]['average_speed'] - \
((((data_animal_id_groups[aid].index - timedelta_left) - (data_animal_id_groups[aid].index[ind - 1])) /
(data_animal_id_groups[aid].index[ind] - data_animal_id_groups[aid].index[ind - 1])) * lag)
# add last term of sum (as some distance is covered in the time window which is recorded in first observation which is not in window
ind = data_animal_id_groups[aid].index.searchsorted(data_animal_id_groups[aid].index + timedelta_right, side='right') # find index of first element not in window
out_of_range = (ind >= data_animal_id_groups[aid].shape[0])
ind[out_of_range] = 0 # otherwise key error
lag = data_animal_id_groups[aid]['distance'].values[ind]
lag[out_of_range] = 0 # set to 0 such that no distance is added to sum
data_animal_id_groups[aid]['lag'] = lag # each observation has stored distance of first observation not in window (which is then partially added)
data_animal_id_groups[aid]['average_speed'] = data_animal_id_groups[aid]['average_speed'] + \
((((data_animal_id_groups[aid].index + timedelta_right) - (data_animal_id_groups[aid].index[ind - 1])) /
(data_animal_id_groups[aid].index[ind] - data_animal_id_groups[aid].index[ind - 1])) * lag)
data_animal_id_groups[aid].drop(['lag'], axis=1, inplace=True)
# divide sum of distances by time (here unit of fps is used)
data_animal_id_groups[aid]['average_speed'] = data_animal_id_groups[aid]['average_speed'] / float(re.search(r'(\d+)', fps).group(0))
# reset index
data_animal_id_groups[aid].reset_index(drop=True, inplace=True)
return data_animal_id_groups
[docs]def compute_average_acceleration(data_animal_id_groups, fps):
"""
Compute average acceleration of mover based on fps parameter. The formula used for calculating average acceleration is: (Final Speed - Initial Speed) / (Total Time Taken).
Size of traveling window is determined by fps parameter:
By choosing f.e. fps=4 at timestamp 5: (speed at timestamp 7 - speed at timestamp 3) / 4.
By choosing f.e. fps=3 at timestamp 5: (speed at timestamp 6.5 - speed at timestamp 3.5) / 3. (in this case use of interpolation if timestamps 3.5 and 6.5 do not exist.)
:param data_animal_id_groups: dictionary with 'animal_id' as keys.
:param fps: integer to define size of window for integer-formatted time or string to define size of window for datetime-formatted time (For possible units refer to:https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases.)
:return: dictionary, including measure for 'average_acceleration'.
"""
if (isinstance(fps, int) or isinstance(fps, float)):
fps = str(fps) + 'us' # as integer time is converted to time for flexible windows so is fps
for aid in data_animal_id_groups.keys():
# set index as time to have flexible sized windows
if is_numeric_dtype(data_animal_id_groups[aid]['time']):
data_animal_id_groups[aid].index = pd.to_datetime(data_animal_id_groups[aid]['time'], unit='us')
else:
data_animal_id_groups[aid].set_index('time', drop=False, inplace=True)
# add speed of first and last observation in window
data_animal_id_groups[aid]['start_speed'] = data_animal_id_groups[
aid]['average_speed'].rolling(min_periods=1, window=fps,
center=True, closed='both').apply(lambda x: x[0], raw=True).fillna(0)
data_animal_id_groups[aid]['end_speed'] = data_animal_id_groups[
aid]['average_speed'].rolling(min_periods=1, window=fps,
center=True, closed='both').apply(lambda x: x[-1], raw=True).fillna(0)
timedelta = pd.to_timedelta(fps) # size time window
timedelta_left = timedelta / 2 # size window left of observation
timedelta_right = timedelta / 2 # size window right of observation
# adjust start speed (as first observation prior to first observation in window is taken into account)
ind = data_animal_id_groups[aid].index.searchsorted(data_animal_id_groups[aid].index - timedelta_left, side='left') # find index of first element in window
out_of_range = (ind <= 0)
lag = data_animal_id_groups[aid]['average_speed'].values[ind-1] # store respective value of first observation prior to window
lag[out_of_range] = data_animal_id_groups[aid]['average_speed'][0] # set to speed of first observation
data_animal_id_groups[aid]['lag'] = lag # each observation has stored speed of first element prior to window
# start speed is calculated as weighted sum of speed from first observation prior to window and from first observation in window
data_animal_id_groups[aid]['start_speed'] = ((((data_animal_id_groups[aid].index - timedelta_left) - (data_animal_id_groups[aid].index[ind - 1])) /
(data_animal_id_groups[aid].index[ind] - data_animal_id_groups[aid].index[ind - 1])) * data_animal_id_groups[aid]['start_speed']) \
+((1-(((data_animal_id_groups[aid].index - timedelta_left) - (data_animal_id_groups[aid].index[ind - 1])) /
(data_animal_id_groups[aid].index[ind] - data_animal_id_groups[aid].index[ind - 1]))) * lag)
# adjust end speed (as first observation after window is taken into account)
ind = data_animal_id_groups[aid].index.searchsorted(data_animal_id_groups[aid].index + timedelta_right, side='right') # find index of first element not in window
out_of_range = (ind >= data_animal_id_groups[aid].shape[0])
ind[out_of_range] = 0 # otherwise key error
lag = data_animal_id_groups[aid]['average_speed'].values[ind] # store respective value of first observation not in window
lag[out_of_range] = data_animal_id_groups[aid]['average_speed'][data_animal_id_groups[aid].shape[0] - 1] # set to speed of last observation
data_animal_id_groups[aid]['lag'] = lag # each observation has stored speed of first observation not in window
# end speed is calculated as weighted sum of speed from last observation in window and from first observation after window
data_animal_id_groups[aid]['end_speed'] = ((((data_animal_id_groups[aid].index + timedelta_right) - (data_animal_id_groups[aid].index[ind - 1])) /
(data_animal_id_groups[aid].index[ind] - data_animal_id_groups[aid].index[ind - 1])) * lag) + \
((1-(((data_animal_id_groups[aid].index + timedelta_right) - (data_animal_id_groups[aid].index[ind - 1])) /
(data_animal_id_groups[aid].index[ind] - data_animal_id_groups[aid].index[ind - 1]))) * data_animal_id_groups[aid]['end_speed'])
# calculate average acceleration by dividing difference by time (here unit of fps is used)
data_animal_id_groups[aid]['average_acceleration'] = (data_animal_id_groups[aid]['end_speed'] - data_animal_id_groups[aid]['start_speed'])\
/ float(re.search(r'(\d+)', fps).group(0))
# reset index and drop columns
data_animal_id_groups[aid].reset_index(drop=True, inplace=True)
data_animal_id_groups[aid].drop(['lag', 'start_speed', 'end_speed'], axis=1, inplace=True)
return data_animal_id_groups
[docs]def computing_stops(data_animal_id_groups, threshold_speed):
"""
Calculate absolute feature, describing a record as stop, based on threshold.
Calculate absolute feature called 'Stopped' where the value is 1 if 'Average_Speed' <= threshold_speed
and 0 otherwise.
:param data_animal_id_groups: dictionary with 'animal_id' as keys.
:param threshold_speed: integer, defining maximum value for 'average_speed' to be considered as a stop.
:return: dictionary, including variable 'stopped'.
"""
for aid in data_animal_id_groups.keys():
data_animal_id_groups[aid]['stopped'] = np.where(
data_animal_id_groups[aid]['average_speed'] <= threshold_speed, 1,
0)
return data_animal_id_groups
[docs]def distance_by_time(data, frm, to):
"""
Computes the distance between positions for a particular time window for all movers.
:param data: pandas DataFrame with all records of movements.
:param frm: int defining the start of the time window. Note that if time is stored as a date (if input data has time not stored as numeric type it is automatically converted to datetime) parameter has to be set using an datetime format: mkit.distance_by_time(data, "2008-01-01", "2010-10-01")
:param to: Int, defining end point up to where to extract records.
:param to: int defining the end of the time window (inclusive)
:return: pandas DataFrame with animal_id and distance
"""
# use auxiliary functions to get distances for each timestep
data_animal_id_groups = grouping_data(data)
data_animal_id_groups = compute_distance(data_animal_id_groups)
aids = []
distances = []
# iterate over grouped dataframes
for aid in data_animal_id_groups.keys():
# take a subst of each dataframe as defined by the time window
df = data_animal_id_groups[aid]
subset = df[(df['time'] >= frm) & (df['time'] <= to)]
# sum up all distances in that time window
distance = subset['distance'].sum()
aids.append(aid)
distances.append(distance)
return pd.DataFrame({'animal_id': aids, 'distance': distances})
[docs]def group_movement(feats):
"""
Returns aggregated movement data, such as distance, mean speed, mean acceleration and mean distance to centroid for the entire group at each time capture.
:param feats: pd DataFrame with animal-specific data - if no features contained, they will be extracted.
:return: pd DataFrame with group-specific values for each time-capture
"""
# Handling no features in input
if 'distance' not in feats.columns or 'average_speed' not in feats.columns or 'average_acceleration' not in feats.columns:
warnings.warn(
'Recalculating features, since distance, speed or acceleration not found in input.'
)
feats = extract_features(feats)
# Handling no centroid in input
if 'distance_to_centroid' not in feats.columns:
warnings.warn(
'Recalculating centroid-distances, since not found in input dataset!'
)
feats = centroid_medoid_computation(feats)
# Group by time, return new dataframe
data_dist = feats.groupby('time')
group = pd.DataFrame({
"total_dist":
data_dist.sum()['distance'],
"mean_speed":
data_dist.mean()['average_speed'],
"mean_acceleration":
data_dist.mean()['average_acceleration'],
"mean_distance_centroid":
data_dist.mean()['distance_to_centroid']
})
return group
[docs]def centroid_medoid_computation(data,
only_centroid=False,
object_output=False):
"""
Calculates the data point (animal_id) closest to center/centroid/medoid for a time step
Uses group by on 'time' attribute
:param data: Pandas DataFrame containing movement records
:param only_centroid: Boolean in case we just want to compute the centroids. Default: False.
:param object_output: Boolean whether to create a point object for the calculated centroids. Default: False.
:return: Pandas DataFrame containing computed medoids & centroids
"""
# check 3D presence
if presence_3d(data):
is_3d = True
else:
is_3d = False
# Group by 'time'-
data_time = data.groupby('time')
# Dictionary to hold grouped data by 'time' attribute-
data_groups_time = {}
for aid in tqdm(data_time.groups.keys(), position=0, desc="Calculating centroid distances"):
data_groups_time[aid] = data_time.get_group(aid)
data_groups_time[aid].reset_index(drop=True, inplace=True)
# Add 3 additional columns to each group-
data_l = [0 for x in range(data_groups_time[aid].shape[0])]
data_groups_time[aid] = data_groups_time[aid].assign(x_centroid=data_l)
data_groups_time[aid] = data_groups_time[aid].assign(y_centroid=data_l)
if is_3d:
data_groups_time[aid] = data_groups_time[aid].assign(z_centroid=data_l)
if only_centroid == False:
data_groups_time[aid] = data_groups_time[aid].assign(medoid=data_l)
data_groups_time[aid] = data_groups_time[aid].assign(
distance_to_centroid=data_l)
# Calculate centroid coordinates (x, y)-
x_mean = np.around(np.mean(data_groups_time[aid]['x']), 3)
y_mean = np.around(np.mean(data_groups_time[aid]['y']), 3)
if is_3d:
z_mean = np.around(np.mean(data_groups_time[aid]['z']), 3)
data_groups_time[aid] = data_groups_time[aid].assign(x_centroid=x_mean)
data_groups_time[aid] = data_groups_time[aid].assign(y_centroid=y_mean)
if is_3d:
data_groups_time[aid] = data_groups_time[aid].assign(z_centroid=z_mean)
# add additional variable for point object
if object_output == True:
data_groups_time[aid] = data_groups_time[aid].assign(
centroid=functions.ST_MakePoint(x_mean, y_mean))
if only_centroid == False:
# Squared distance of each 'x' coordinate to 'centroid'-
x_temp = (data_groups_time[aid].loc[:, 'x'] - x_mean) ** 2
# Squared distance of each 'y' coordinate to 'centroid'-
y_temp = (data_groups_time[aid].loc[:, 'y'] - y_mean) ** 2
# Squared distance of each 'z' coordinate to 'centroid'-
if is_3d:
z_temp = (data_groups_time[aid].loc[:, 'z'] - z_mean) ** 2
# Distance of each point from centroid-
if is_3d:
dist = np.sqrt(x_temp + y_temp + z_temp)
else:
dist = np.sqrt(x_temp + y_temp)
# Assign computed distances to 'distance_to_centroid' attribute-
data_groups_time[aid] = data_groups_time[aid].assign(
distance_to_centroid=np.around(dist, decimals=3))
# Find 'animal_id' nearest to centroid for this group-
pos = np.argmin(
data_groups_time[aid]['distance_to_centroid'].values)
nearest = data_groups_time[aid].loc[pos, 'animal_id']
# Assign 'medoid' for this group-
data_groups_time[aid] = data_groups_time[aid].assign(
medoid=nearest)
medoid_data = regrouping_data(data_groups_time)
return medoid_data
[docs]def euclidean_dist(data):
"""
Compute the euclidean distance between movers for one individual grouped time step using the Scipy 'pdist' and 'squareform' methods.
:param data: Preprocessed pandas DataFrame with positional record data containing no duplicates.
:return: pandas DataFrame, including computed euclidean distances.
"""
if len(data) >= 200000: # then multiproccessing
# creating list of different animals df's to split them in pool
data = timewise_dict(data)
df_list = []
for aid in data.keys():
df_list.append(data[aid])
# use multiprocessing to call extract_features for each animal with different process
if __name__ == "movekit.feature_extraction":
pool = multiprocessing.Pool()
result = pool.map(euclidean_dist_multiproccessing, df_list)
# regroup in one big data frame and return
big_df = pd.DataFrame()
for df in result:
big_df = big_df.append(df, ignore_index=True)
big_df = big_df.sort_values(by=['time','animal_id'])
big_df = big_df.reset_index(drop=True)
return big_df
if presence_3d(data):
weights = {'x': 1, 'y': 1, 'z': 1}
else:
weights = {'x': 1, 'y': 1}
out = compute_similarity(data, weights)
return out
[docs]def euclidean_dist_multiproccessing(data):
"""
Compute the euclidean distance between movers for one individual grouped time step using the Scipy 'pdist' and 'squareform' methods.
:param data: Preprocessed pandas DataFrame with positional record data containing no duplicates.
:return: pandas DataFrame, including computed euclidean distances.
"""
if presence_3d(data):
weights = {'x': 1, 'y': 1, 'z': 1}
else:
weights = {'x': 1, 'y': 1}
out = compute_similarity_multiproccessing(data, weights)
return out
[docs]def compute_similarity(data, weights, p=2):
"""
Compute positional similarity between animals.
Computing the positional similarity in a distance matrix according to animal_id for each time step.
:param data: pandas DataFrame, containing preprocessed movement records.
:param weights: dictionary, giving variable's weights in weighted distance calculation.
:param p: integer, giving p-norm for Minkowski, weighted and unweighted. Default: 2.
:return: pandas DataFrame, including computed similarities.
"""
w = [] # weight vector
not_allowed_keys = ['time', 'animal_id']
df = pd.DataFrame()
for key in weights:
if key in data.columns:
df[key] = data[key]
w.append(weights[key])
# compute the distance for each time moment
df2 = data.groupby('time')
df3 = pd.DataFrame() # empty df in which all the data frames containing the distance are merged
for start in tqdm(df2.groups.keys(), position=0, desc="Computing euclidean distance"):
groups_df = df2.get_group(start).groupby('time', group_keys=False).apply(similarity_computation, w=w,
p=p) # calculate distance for each time period
df3 = pd.concat([df3, groups_df]) # finally all dataframes are merged in df3
# combine the distance matrix with the data and return
return pd.merge(data, df3, left_index=True,
right_index=True).sort_values(by=['time', 'animal_id'])
[docs]def compute_similarity_multiproccessing(data, weights, p=2):
"""
Compute positional similarity between animals.
Computing the positional similarity in a distance matrix according to animal_id for each time step.
:param data: pandas DataFrame, containing preprocessed movement records.
:param weights: dictionary, giving variable's weights in weighted distance calculation.
:param p: integer, giving p-norm for Minkowski, weighted and unweighted. Default: 2.
:return: pandas DataFrame, including computed similarities.
"""
w = [] # weight vector
not_allowed_keys = ['time', 'animal_id']
df = pd.DataFrame()
for key in weights:
if key in data.columns:
df[key] = data[key]
w.append(weights[key])
df2 = data.groupby('time')
df3 = pd.DataFrame() # empty df in which all the data frames containing the distance are merged
for start in tqdm(df2.groups.keys(), position=0, desc="Computing euclidean distance", disable=True):
groups_df = df2.get_group(start).groupby('time', group_keys=False).apply(similarity_computation, w=w,
p=p) # calculate distance for each time period
df3 = pd.concat([df3, groups_df]) # finally all dataframes are merged in df3
# combine the distance matrix with the data and return
return pd.merge(data, df3, left_index=True,
right_index=True).sort_values(by=['time', 'animal_id'])
[docs]def similarity_computation(group, w, p):
"""
Compute similarity between records.
Compute the Minkowski similarity for one individual grouped time step using the Scipy pdist and squareform methods
:param group: pandas DataFrame, containing preprocessed movement records.
:param w: array, consisting of the weight vector.
:param p: double, applies the respective p-norm for weighted Minkowski.
:return: pandas DataFrame, including the distances of the records.
"""
# ids of each animal
ids = group['animal_id'].tolist()
# compute and assign the distances for each time step
return pd.DataFrame(squareform(pdist(group.loc[:, ['x', 'y']], 'minkowski', p=p)),
index=group.index,
columns=ids)
[docs]def ts_all_features(data):
"""
Perform time series analysis on record data.
:param data: pandas DataFrame, containing preprocessed movement records and features.
:return: pandas DataFrame, containing extracted time series features for each id for each feature.
"""
# Remove the column 'stopped' as it has nominal values and 'direction' as it is a vector and additional columns from Movebank data.
rm_colm = ['stopped', 'direction','event-id', 'visible', 'location-long',
'location-lat', 'behavioural-classification', 'comments',
'study-specific-measurement', 'sensor-type',
'individual-taxon-canonical-name', 'tag-local-identifier',
'study-name']
df = data[data.columns.difference(rm_colm)]
time_series_features = tsfresh.extract_features(df,
column_id='animal_id',
column_sort='time')
tsfresh.utilities.dataframe_functions.impute(time_series_features)
return time_series_features
[docs]def ts_feature(data, feature):
"""
Perform time series analysis by extracting specified time series features from record data.
:param data: pandas DataFrame, containing preprocessed movement records and features.
:param feature: time series feature which is extracted from the movement records.
:return: pandas DataFrame, containing defined extracted time series features for each id for each feature.
"""
fc_parameters = tsfresh.feature_extraction.ComprehensiveFCParameters()
if feature in fc_parameters:
settings = {}
settings[feature] = fc_parameters[feature]
# Remove the column 'stopped' as it has nominal values and 'direction' as it is a vector and additional columns from Movebank data.
rm_colm = ['stopped', 'direction','event-id', 'visible', 'location-long',
'location-lat', 'behavioural-classification', 'comments',
'study-specific-measurement', 'sensor-type','individual-taxon-canonical-name',
'tag-local-identifier', 'study-name']
df = data[data.columns.difference(rm_colm)]
time_series_features = tsfresh.extract_features(
df,
column_id='animal_id',
column_sort='time',
default_fc_parameters=settings)
time_series_features = time_series_features.rename_axis('variable', axis=1) # rename axis
time_series_features = time_series_features.rename_axis("id", axis=0)
return time_series_features
else:
print("Time series feature is not known.")
return
[docs]def explore_features(data):
"""
Show percentage of environment space explored by singular animal.
Using minumum and maximum of coordinates, given by 'x' and 'y' (and 'z') features in input DataFrame.
:param data: pandas DataFrame, containing preprocessed movement records.
:return: None.
"""
# Compute global minimum and maximum if user
# has NOT specified the values-^
x_min = data['x'].min()
x_max = data['x'].max()
y_min = data['y'].min()
y_max = data['y'].max()
if presence_3d(data):
is_3d = True
z_min = data['z'].min()
z_max = data['z'].max()
else:
is_3d = False
# Group 'data' using 'animal_id' attribute-
data_groups = grouping_data(data)
# for each animal, how much of the area has the animal covered?
# % of space explored by each animal? #
for aid in data_groups:
aid_x_min = data_groups[aid]['x'].min()
aid_x_max = data_groups[aid]['x'].max()
aid_y_min = data_groups[aid]['y'].min()
aid_y_max = data_groups[aid]['y'].max()
print("\nAnimal ID: {0} covered % of area:".format(aid))
print(f'x-coordinates: {round(((aid_x_max-aid_x_min)/(x_max - x_min))*100, 2)}')
print(f'y-coordinates: {round(((aid_y_max - aid_y_min) / (y_max - y_min))*100, 2)}')
if is_3d:
aid_z_min = data_groups[aid]['z'].min()
aid_z_max = data_groups[aid]['z'].max()
print(f'z-coordinates: {round(((aid_z_max - aid_z_min) / (z_max - z_min))*100, 2)}')
return None
[docs]def explore_features_geospatial(preprocessed_data):
"""
Show exploration of environment space by each animal using 'shapely' package.
Gives singular descriptions of polygon area covered by each animal and combined.
Additionally a plot of the respective areas is provided.
:param preprocessed_data: pandas DataFrame, containing preprocessed movement records.
:return: None.
"""
with tqdm(total=100, position=0, desc="Calculating covered areas") as pbar:
# Create dictionary, grouping records by animal ID as key
data_groups = grouping_data(preprocessed_data)
percent_update = 100 / (len(data_groups.keys()) + 1) # since also group object is created
# Python dict to hold X-Y coordinates for each animal-
xy_coord = {}
# Polygon for singular animal ID
for aid in data_groups.keys():
xy_coord[aid] = []
# Extract position information per animal into list of xy-tuples
for aid in data_groups.keys():
for x in range(data_groups[aid].shape[0]):
temp_tuple = (data_groups[aid].loc[x,
'x'], data_groups[aid].loc[x,
'y'])
xy_coord[aid].append(temp_tuple)
pbar.update(percent_update)
# Polygons for individual animals
for aid in data_groups.keys():
poly = Polygon(xy_coord[aid]).convex_hull
# Compute area of singular polygons and plot
print(
"\nArea (polygon) covered by animal ID = {0} is = {1:.2f} sq. units\n"
.format(aid, poly.area))
plt.plot(*poly.exterior.xy)
# Polygon for collective group
xy_coord_full = []
for aid in data_groups.keys():
for x in range(data_groups[aid].shape[0]):
temp_tuple = (data_groups[aid].loc[x,
'x'], data_groups[aid].loc[x,
'y'])
xy_coord_full.append(temp_tuple)
# Create 'Polygon' object using all coordinates for animal ID combined
full_poly = Polygon(xy_coord_full).convex_hull
# Compute area of collective polygon and plot
print("\nArea (polygon) covered by animals collectively is = ",
full_poly.area, "sq. units")
plt.plot(*full_poly.exterior.xy, linewidth=5, color="black")
plt.show()
pbar.update(percent_update)
return None
[docs]def outlier_detection(dataset, features=["distance", "average_speed", "average_acceleration",
"stopped", "turning"], remove=False, algorithm="KNN", **kwargs):
"""
Detect outliers based on different pyod algorithms.
Note: User may decide on different parameters specific to algorithm chosen.
:param dataset: Dataframe containing the movement records.
:param features: list of features to detect outliers upon. Default: ["distance", "average_speed", "average_acceleration",
"stopped", "turning"]
:param remove: Boolean deciding whether outliers should be removed in returned dataframe. Default: False (outliers are not removed).
:param algorithm: String defining which algorithm to use for finding the outliers. The following algorithms are available:
"KNN", "ECOD", "COPOD", "ABOD", "MAD", "SOS", "KDE", "Sampling", "GMM", "PCA", "KPCA",
"MCD", "CD", "OCSVM", "LMDD", "LOF", "COF", "CBLOF", "LOCI", "HBOS", "SOD", "ROD",
"IForest", "INNE", "LSCP","LODA", "VAE", "SO_GAAL", "MO_GAAL", "DeepSVDD", "AnoGAN", "ALAD", "R-Graph".
Additional available algorithms: FastABOD: call algorithm="ABOD" with method="fast",
AvgKNN: call algorithm="KNN" with method="mean", MedKNN: call algorithm="KNN" with
method="median". Default algorithm is "KNN". For more information regarding all the
algorithms refer to: https://pyod.readthedocs.io/en/latest/pyod.models.html#
:param kwargs: Specific to the algorithm additional parameters can be specified. For further information about the
algorithm-specific parameters once again refer to:
https://pyod.readthedocs.io/en/latest/pyod.models.html#
For the default algorithm "KNN" some additional parameters are for example:
contamination - the contamination threshold, n_neighbors - the number of neighbors,
method - which KNN to use, and metric - for distance computation.
:return: Dataframe containing information for each movement record whether outlier or not.
"""
if algorithm == "KNN":
from pyod.models.knn import KNN
clf = KNN(**kwargs)
elif algorithm == "ECOD":
from pyod.models.ecod import ECOD
clf = ECOD(**kwargs)
elif algorithm == "ABOD":
from pyod.models.abod import ABOD
clf = ABOD(**kwargs)
elif algorithm == "COPOD":
from pyod.models.copod import COPOD
clf = COPOD(**kwargs)
elif algorithm == "MAD":
from pyod.models.mad import MAD
clf = MAD(**kwargs)
elif algorithm == "SOS":
from pyod.models.sos import SOS
clf = SOS(**kwargs)
elif algorithm == "KDE":
from pyod.models.kde import KDE
clf = KDE(**kwargs)
elif algorithm == "Sampling":
from pyod.models.sampling import Sampling
clf = Sampling(**kwargs)
elif algorithm == "GMM":
from pyod.models.gmm import GMM
clf = GMM(**kwargs)
elif algorithm == "PCA":
from pyod.models.pca import PCA
clf = PCA(**kwargs)
elif algorithm == "KPCA":
from pyod.models.kpca import KPCA
clf = KPCA(**kwargs)
elif algorithm == "MCD":
from pyod.models.mcd import MCD
clf = MCD(**kwargs)
elif algorithm == "CD":
from pyod.models.cd import CD
clf = CD(**kwargs)
elif algorithm == "OCSVM":
from pyod.models.ocsvm import OCSVM
clf = OCSVM(**kwargs)
elif algorithm == "LMDD":
from pyod.models.lmdd import LMDD
clf = LMDD(**kwargs)
elif algorithm == "LOF":
from pyod.models.lof import LOF
clf = LOF(**kwargs)
elif algorithm == "COF":
from pyod.models.cof import COF
clf = COF(**kwargs)
elif algorithm == "CBLOF":
from pyod.models.cblof import CBLOF
clf = CBLOF(**kwargs)
elif algorithm == "LOCI":
from pyod.models.loci import LOCI
clf = LOCI(**kwargs)
elif algorithm == "HBOS":
from pyod.models.hbos import HBOS
clf = HBOS(**kwargs)
elif algorithm == "SOD":
from pyod.models.sod import SOD
clf = SOD(**kwargs)
elif algorithm == "ROD":
from pyod.models.rod import ROD
clf = ROD(**kwargs)
elif algorithm == "IForest":
from pyod.models.iforest import IForest
clf = IForest(**kwargs)
elif algorithm == "INNE":
from pyod.models.inne import INNE
clf = INNE(**kwargs)
elif algorithm == "FB" or algorithm == "FeatureBagging":
warnings.warn('At the moment this algorithm is not supported by movekit.')
elif algorithm == "LSCP":
from pyod.models.lscp import LSCP
clf = LSCP(**kwargs)
elif algorithm == "XGBOD":
warnings.warn('At the moment this algorithm is not supported by movekit.')
elif algorithm == "LODA":
from pyod.models.loda import LODA
clf = LODA(**kwargs)
from pyod.models.loda import LODA
elif algorithm == "SUOD":
warnings.warn('At the moment this algorithm is not supported by movekit.')
elif algorithm == "AutoEncoder":
warnings.warn('At the moment this algorithm is not supported by movekit.')
elif algorithm == "VAE":
from pyod.models.vae import VAE
clf = VAE(**kwargs)
elif algorithm == "SO_GAAL":
from pyod.models.so_gaal import SO_GAAL
clf = SO_GAAL(**kwargs)
elif algorithm == "MO_GAAL":
from pyod.models.mo_gaal import MO_GAAL
clf = MO_GAAL(**kwargs)
elif algorithm == "DeepSVDD":
from pyod.models.deep_svdd import DeepSVDD
clf = DeepSVDD(**kwargs)
elif algorithm == "AnoGAN":
from pyod.models.anogan import AnoGAN
clf = AnoGAN(**kwargs)
elif algorithm == "ALAD":
from pyod.models.alad import ALAD
clf = ALAD(**kwargs)
elif algorithm == "R-Graph" or algorithm == "RGraph":
from pyod.models.rgraph import RGraph
clf = RGraph(**kwargs)
elif algorithm == "LUNAR":
warnings.warn('At the moment this algorithm is not supported by movekit.')
inp_data = dataset.loc[:, features]
clf.fit(inp_data)
scores_pred = clf.predict(inp_data)
# avoid overwriting input
# Inserting column, with 1 if outlier, else 0
if "outlier" in dataset:
dataset["outlier"] = scores_pred
else:
dataset.insert(2, "outlier", scores_pred)
if remove:
dataset = dataset.loc[dataset["outlier"]==0, :]
return dataset
[docs]def split_movement_trajectory(data, stop_threshold=0.5, csv=False):
"""
Split trajectories of movers in stopping and moving phases.
:param data: pandas DataFrame containing preprocessed movement records.
:param stop_threshold: integer to specify threshold for average speed, such that we consider timestamp a "stop".
:param csv: Boolean, defining if each phase shall be exported locally as singular csv.
:return: dictionary with animal_id as key and list of individual dataFrames for each movement phase as values.
"""
if not (set(['distance', 'average_speed', 'average_acceleration', 'direction', 'stopped', 'turning']).issubset(
data.columns)):
warnings.warn('Some features are missing and thus will be extracted first. Note that is recommended to use function '
'extract_features prior to this function to define values for calculation of average speed and stopping criteria.'
'The returned Data Frame should afterwards be used as input for this function.')
data = extract_features(data, stop_threshold=stop_threshold)
data_groups = grouping_data(data)
df_dict = {}
for aid in data_groups.keys():
df = data_groups[aid]
df_dict[aid] = []
beg = 0
for i in range(1, len(df)):
if (df.stopped[i] != df.stopped[i - 1]):
phase_df = data_groups[aid].iloc[beg:i, :]
phase_df = phase_df.reset_index(drop=True)
df_dict[aid].append(phase_df)
beg = i
# for the last one
if (df.stopped[len(df) - 1] == df.stopped[len(df) - 2]):
phase_df = data_groups[aid].iloc[beg:, :]
phase_df = phase_df.reset_index(drop=True)
df_dict[aid].append(phase_df)
else:
phase_df = data_groups[aid].iloc[(len(df) - 1):, :]
phase_df = phase_df.reset_index(drop=True)
df_dict[aid].append(phase_df)
print(f' For animal {aid} the trajectory was split in {len(df_dict[aid])} phases of moving and stopping.')
if csv == True:
for aid in df_dict.keys():
for i in range(len(df_dict[aid])):
df_dict[aid][i].to_csv(f'{aid}_{i}.csv', index=False)
return df_dict
[docs]def movement_stopping_durations(data, stop_threshold=0.5):
"""
Split trajectories of movers in stopping and moving phases and return the duration of each phase.
:param data: pandas DataFrame containing preprocessed movement records.
:param stop_threshold: integer to specify threshold for average speed, such that we consider timestamp a "stop".
:return: dictionary with animal_id as key and DataFrame with the different phases and their durations as value.
"""
data = split_movement_trajectory(data, stop_threshold)
df_dict = {}
for aid in data.keys():
df_list = data[aid]
time_df = pd.DataFrame({'animal_id': aid}, index=[0])
for index, df in enumerate(df_list):
try:
time_diff = df_list[index+1].time[0] - df.time[0]
except: # for the last phase
time_diff = (df.time[len(df)-1] - df.time[0]) + (df.time[0] - df_list[index-1].time[len(df_list[index-1])-1])
if len(time_df.columns) != 1: # adjust if only one phase for mover
time_df[f'Duration of phase {index+1} ({"stopping" if df["stopped"][0] == 1 else "moving"})'] = time_diff
else:
time_df[f'Duration of phase {index + 1} ({"stopping" if df["stopped"][0] == 1 else "moving"})'] = len(df)
df_dict[aid] = time_df
return df_dict
[docs]def hausdorff_distance(data, mover1=None, mover2=None):
"""
Calculate the Hausdorff-Distance between trajectories of different movers.
:param data: pandas DataFrame containing movement records.
:param mover1: animal_id of the first mover if Hausdorff distance is just to be calculated between two movers.
:param mover2: animal_id of the second mover if Hausdorff distance is just to be calculated between two movers
:return: Hausdorff distance between two specified movers. If no movers are specified, Hausdorff distance between
all movers in the data to each other as a Pandas DataFrame.
"""
presence3d = presence_3d(data)
data = grouping_data(data)
if mover1 is None and mover2 is None:
df = pd.DataFrame()
for aid1 in data.keys():
dict = {}
for aid2 in data.keys():
if (presence3d):
hdf_distance1 = directed_hausdorff(data[aid1][["x", "y", "z"]].values,
data[aid2][["x", "y", "z"]].values)[0]
hdf_distance2 = directed_hausdorff(data[aid2][["x", "y", "z"]].values,
data[aid1][["x", "y", "z"]].values)[0]
hdf_distance = max(hdf_distance1, hdf_distance2)
else:
hdf_distance1 = directed_hausdorff(data[aid1][["x", "y"]].values,
data[aid2][["x", "y"]].values)[0]
hdf_distance2 = directed_hausdorff(data[aid2][["x", "y"]].values,
data[aid1][["x", "y"]].values)[0]
hdf_distance = max(hdf_distance1, hdf_distance2)
dict[aid2] = hdf_distance
aid_df = pd.DataFrame(dict, index=[aid1])
df = pd.concat([df, aid_df])
return df
else:
if presence3d:
hdf_distance1 = directed_hausdorff(data[mover1][["x", "y", "z"]].values,
data[mover2][["x", "y", "z"]].values)[0]
hdf_distance2 = directed_hausdorff(data[mover2][["x", "y", "z"]].values,
data[mover1][["x", "y", "z"]].values)[0]
hdf_distance = max(hdf_distance1, hdf_distance2)
else:
hdf_distance1 = directed_hausdorff(data[mover1][["x", "y"]].values,
data[mover2][["x", "y"]].values)[0]
hdf_distance2 = directed_hausdorff(data[mover2][["x", "y"]].values,
data[mover1][["x", "y"]].values)[0]
hdf_distance = max(hdf_distance1, hdf_distance2)
return hdf_distance
[docs]def segment_data(data, feature, threshold, csv=False, fps=10, stop_threshold=0.5):
"""
Segment data in subsets by feature values using a given threshold value. For instance, by using the average speed as feature split the dataset in segments above and below a given threshold.
:param data: dataframe containing the feature which is used to split the dataset. Note that if feature is 'distance', 'average_speed', 'average_acceleration', 'direction', 'stopped' or 'turning',
feature can also be extracted within the function. In that case one should define the input parameters to use when extract_features() is called.
:param feature: column name of the feature used to split data in subsets.
:param threshold: threshold used to split data according to feature value.
:param csv: Boolean, defining if each subset shall be exported locally as singular csv.
:param fps: used if features are not extracted before but within the function by calling extract_features():
size of window used to calculate average speed and average acceleration:
integer to define size of window for integer-formatted time or string to define size of window for datetime-formatted time (For possible units refer to:https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases.)
:param stop_threshold: used if features are not extracted before but within the function by calling extract_features():
integer to specify threshold for average speed, such that we consider timestamp a "stop".
:return: dictionary with id of different movers as key and a list of all the subsets for this mover as values. Subsets are thereby stored as dataframe.
"""
if feature not in data.columns:
if feature not in ['distance', 'average_speed', 'average_acceleration', 'direction', 'stopped', 'turning']:
warnings.warn('Given feature to segment data is not a column name of the data. Please redefine the feature or add feature to the data.')
else:
warnings.warn('Feature to segment data is missing and thus will be extracted first. Note that it is recommended to use function '
'extract_features prior to this function to define values for calculation of average speed and stopping criteria.'
'The returned Data Frame should afterwards be used as input for this function.')
data = extract_features(data, fps=fps, stop_threshold=stop_threshold)
data_groups = grouping_data(data)
df_dict = {}
for aid in data_groups.keys():
df = data_groups[aid]
df_dict[aid] = []
beg = 0
for i in range(1, len(df)):
if (df[feature][i] <= threshold and df[feature][i-1]>threshold):
phase_df = data_groups[aid].iloc[beg:i, :]
phase_df = phase_df.reset_index(drop=True)
df_dict[aid].append(phase_df)
beg = i
elif (df[feature][i] > threshold and df[feature][i-1]<=threshold):
phase_df = data_groups[aid].iloc[beg:i, :]
phase_df = phase_df.reset_index(drop=True)
df_dict[aid].append(phase_df)
beg = i
# for the last one
if ((df[feature][len(df) - 1] > threshold and df[feature][len(df) - 2] > threshold) or
(df[feature][len(df) - 1] <= threshold and df[feature][len(df) - 2] <= threshold)):
phase_df = data_groups[aid].iloc[beg:, :]
phase_df = phase_df.reset_index(drop=True)
df_dict[aid].append(phase_df)
else:
phase_df = data_groups[aid].iloc[(len(df) - 1):, :]
phase_df = phase_df.reset_index(drop=True)
df_dict[aid].append(phase_df)
print(f' For animal {aid} the trajectory was split in {len(df_dict[aid])} phases.')
if csv == True:
for aid in df_dict.keys():
for i in range(len(df_dict[aid])):
df_dict[aid][i].to_csv(f'{aid}_{i}.csv', index=False)
return df_dict
[docs]def outlier_by_threshold(data, feature_thresholds, remove=False):
"""
Identify outliers by user given features with specific minimum and maximum thresholds.
:param data: data on which outliers are detected.
:param feature_thresholds: dictionary containing the features as keys and the minimum/maximum threshold as two element list.
For example if one would only want to declare all data points having an average speed < 5 and > 10 as outliers:
feature_threshold = {'average_speed': [5,10]}
:param remove: Boolean deciding whether outliers should be removed in returned dataframe. Default: False (outliers are not removed).
:return: Dataframe containing information for each record whether it is an outlier according to the defined threshold values.
"""
data['outlier_by_threshold'] = np.zeros(len(data))
for k in feature_thresholds.keys():
data.loc[(data[k] < feature_thresholds[k][0]) | (data[k] > feature_thresholds[k][1]), 'outlier_by_threshold'] = 1
if remove:
data = data.loc[data['outlier_by_threshold'] == 0, :]
data['outlier_by_threshold'] = pd.to_numeric(data['outlier_by_threshold'], downcast='integer')
return data
[docs]def getis_ord(data, x_grids_per_t=3, y_grids_per_t=3, time_grids=3):
"""
Calculate the Getis-Ord G* statistic for each x-y-time interval of the data. Interval size is specified by input.
For more information about how the statistic is calculated please refer to: https://sigspatial2016.sigspatial.org/giscup2016/problem
:param data: pandas Data frame containing the movement data in the columns x, y and time.
:param x_grids_per_t: int defining how many x intervals there are for each time step. The x axis is subdivided uniformly,
i.e. if the maximum value of x in the data is 100 and the minimum value is 10, by setting x_grids_per_t = 3 for each
time step there are 3 intervals ([10,40),[40,70),[70,100])
:param y_grids_per_t: int defining how many y intervals there are for each time step. The y axis is subdivided uniformly,
i.e. if the maximum value of y in the data is 50 and the minimum value is 10, by setting y_grids_per_t = 4 for each
time step there are 4 intervals ([10,20),[20,30),[30,40),[50,50])
:param time_grids: int defining how many time intervals there are. The time axis is subdivided uniformly, i.e. if
the maximum value of time in the data is 500 and the minimum value is 0, by setting time_grids = 5 there are 5 time
intervals ([0,100),[100,200),[200,300),[300,400),[400,500])
Note that if one defines f.e. x_grids_per_t = 3, y_grids_per_t = 3 and time_grids = 5 the space time cube used for
calculating G* contains 3*3*5=45 intervals.
return: Pandas data frame containing the Getis-Ord statistic for each examined interval (intervals are defined by six
columns defining the respective start and end values of the intervals' x-coordinate, y-coordinate and time.
"""
# calculate interval values
time_range = data['time'].max() - data['time'].min()
time_min = data['time'].min()
x_range = data['x'].max() - data['x'].min()
x_min = data['x'].min()
y_range = data['y'].max() - data['y'].min()
y_min = data['y'].min()
int_length_x = x_range / x_grids_per_t
int_length_y = y_range / y_grids_per_t
int_length_t = time_range / time_grids
points = []
grid = []
scores = {}
for x in range(x_grids_per_t):
for y in range(y_grids_per_t):
for t in range(time_grids):
points.append((x, y, t))
grid.append(([x_min + x * int_length_x, x_min + (x+1) * int_length_x],
[y_min + y * int_length_y, y_min + (y+1) * int_length_y],
[time_min + t * int_length_t, time_min + (t+1) * int_length_t])) # interval values for x, y, t
scores[(x, y, t)] = 0
# assign observations to intervals
df = data[['x', 'y', 'time']]
for r in range(len(data)):
x_val, y_val, t_val = -1, -1, -1
for x in range(x_grids_per_t):
if (df.iloc[r, 0] >= (x_min + x * int_length_x)) and (df.iloc[r, 0] < (x_min + (x+1) * int_length_x)):
x_val = x
break
if x_val == -1:
if df.iloc[r, 0] == data['x'].max(): # observations having end values of last interval as value
x_val = x
for y in range(y_grids_per_t):
if (df.iloc[r, 1] >= (y_min + y * int_length_y)) and (df.iloc[r, 1] < (y_min + (y+1) * int_length_y)):
y_val = y
break
if y_val == -1:
if df.iloc[r, 1] == data['y'].max(): # observations having end values of last interval as value
y_val = y
for t in range(time_grids):
if (df.iloc[r, 2] >= (time_min + t * int_length_t)) and (df.iloc[r, 2] < (time_min + (t+1) * int_length_t)):
t_val = t
break
if t_val == -1:
if df.iloc[r, 2] == data['time'].max(): # observations having end values of last interval as value
t_val = t
scores[(x_val, y_val, t_val)] = scores[(x_val, y_val, t_val)] + 1
scores = np.array(list(scores.values()))
# calculate weights in space time cube
d = squareform(pdist(np.array(points)))
d[d <= 1.9] = 1
d[d > 1] = 0
for i in range(len(d)):
sum = d[i].sum()
d[i][d[i] == 1] = sum / len(d[i])
# calculate G*
n = len(scores)
nom_one = d @ scores # first term nominator
x_mean = np.mean(scores)
nom_two = x_mean * np.sum(d, axis=1) # second term nominator
S = np.sqrt(((scores ** 2).sum()) / n - x_mean ** 2) # S
denom_root = np.sqrt((n * np.sum(d ** 2, axis=1) - (np.sum(d, axis=1)) ** 2) / (n - 1)) # denominator root
nom = nom_one - nom_two
denom = S * denom_root
G = nom / denom
if not np.all(denom):
warnings.warn('The calculated Getis-Ord Scores are NaN for some intervals because the intervals are neighbors'
' with all other intervals. To avoid this problem one has to define either a higher number for '
'x_grids_per_t or y_grid_per_t or time_grids.')
# return g scores in data frame
score_df = pd.DataFrame({'time0': np.unique(grid, axis=0)[:, 2, 0].tolist(),
'time1': np.unique(grid, axis=0)[:, 2, 1].tolist(),
'x0': np.unique(grid, axis=0)[:, 0, 0].tolist(),
'x1': np.unique(grid, axis=0)[:, 0, 1].tolist(),
'y0': np.unique(grid, axis=0)[:, 1, 0].tolist(),
'y1': np.unique(grid, axis=0)[:, 1, 1].tolist(),
'Getis-Ord Score': G})
score_df.sort_values(['time0', 'x0', 'y0'], inplace=True)
return score_df.reset_index(drop=True)