Aggregated time duplication analysis for the Java-format datasets¶
In [115]:
from os import listdir
from os.path import isfile, join
import numpy as np
import pandas as pd
# Columns holding elapsed time since recording start, and per-sample deltas.
time_from_start_columns = ['registeredTime', 'scheduledTime']
delta_columns = ['registeredDelta', 'scheduledDelta']


def convert_to_timedelta(df: pd.DataFrame) -> pd.DataFrame:
    """Parse the raw time columns of a frame into pandas time types.

    Time-from-start columns become ``Timedelta``; delta columns become
    float milliseconds. Returns a converted copy; ``df`` is untouched.
    """
    copy = df.copy()
    for column in time_from_start_columns:
        try:
            copy[column] = pd.to_timedelta(copy[column])
        except ValueError:
            # Unparseable values (e.g. negative entries) — leave the
            # column as-is instead of the previous no-op self-assignment.
            pass
    for column in delta_columns:
        # Store deltas as plain float milliseconds for easy aggregation.
        copy[column] = pd.to_timedelta(copy[column]).dt.total_seconds() * 1000
    return copy
def skip_trailing_1s(df: pd.DataFrame) -> pd.DataFrame:
    """Drop samples within one second of the recording's start and end."""
    trimmed = df.copy()
    one_second = pd.Timedelta(seconds=1)
    lower = trimmed['registeredTime'].min() + one_second
    upper = trimmed['registeredTime'].max() - one_second
    # `between` is inclusive on both ends, matching >= / <= bounds.
    return trimmed[trimmed['registeredTime'].between(lower, upper)]
Reading the data and converting the time columns¶
In [116]:
files_prefix = '../../data'  # was an f-string with no placeholders
folder = 'java-set/bamboo_r'
set_path = f'{files_prefix}/{folder}'
comment_mark = '#'
separator = '\\s+'  # whitespace-delimited columns
dfs = []
# Sort the directory listing: listdir order is platform-dependent, and the
# order of `dfs` shows up in every aggregate below — keep it reproducible.
for f in sorted(listdir(set_path)):
    filepath = join(set_path, f)
    if isfile(filepath):
        data = pd.read_csv(filepath, sep=separator, comment=comment_mark)
        data = convert_to_timedelta(data)
        data = skip_trailing_1s(data)
        dfs.append(data)
Sizes¶
In [117]:
# Row count per loaded dataframe (len(df) == df.index.size).
print('Size of each dataframe:', [len(frame) for frame in dfs])
Size of each dataframe: [2508, 2213, 2263, 1796, 1870]
Aggregated statistics of the data¶
In [118]:
def get_statistics(df: pd.DataFrame) -> pd.DataFrame:
    """Return describe() statistics with the time columns expressed in seconds."""
    in_seconds = df.copy()
    for column in ('registeredTime', 'scheduledTime'):
        # describe() needs numeric data, so unwrap the Timedeltas.
        in_seconds[column] = in_seconds[column].dt.total_seconds()
    return in_seconds.describe()
# Median of each describe() statistic across the per-frame summaries.
statistics = [get_statistics(frame) for frame in dfs]
stacked = pd.concat(statistics).reset_index()
concatenated_statistics = stacked.groupby('index').median()
display(concatenated_statistics)
absoluteTime | registeredTime | scheduledTime | registeredDelta | scheduledDelta | x | y | tiltX | tiltY | pressure | |
---|---|---|---|---|---|---|---|---|---|---|
index | ||||||||||
25% | 1.725878e+09 | 9.099000 | 9.090000 | 7.000000 | 0.000000 | 40.179313 | -70.604856 | 0.0 | 0.0 | 0.478006 |
50% | 1.725882e+09 | 13.657000 | 13.647000 | 8.000000 | 0.000000 | 60.135771 | -50.239566 | 0.0 | 0.0 | 0.492669 |
75% | 1.725887e+09 | 17.899500 | 17.889000 | 8.000000 | 16.000000 | 81.128141 | -31.860673 | 0.0 | 0.0 | 0.531769 |
count | 2.213000e+03 | 2213.000000 | 2213.000000 | 2213.000000 | 2213.000000 | 2213.000000 | 2213.000000 | 2213.0 | 2213.0 | 2213.000000 |
max | 1.725891e+09 | 22.142000 | 22.134000 | 8.000000 | 25.000000 | 110.770286 | -9.060076 | 0.0 | 0.0 | 0.556207 |
mean | 1.725882e+09 | 13.656585 | 13.648803 | 7.502872 | 7.498453 | 61.529708 | -51.683226 | 0.0 | 0.0 | 0.487499 |
min | 1.725874e+09 | 5.171000 | 5.165000 | 7.000000 | 0.000000 | 13.493458 | -100.755020 | 0.0 | 0.0 | 0.380254 |
std | 4.794281e+03 | 4.794281 | 4.792221 | 0.500104 | 8.481888 | 25.710656 | 24.655257 | 0.0 | 0.0 | 0.039514 |
In [119]:
# How often each registeredTime value repeats within a frame,
# summarized per frame and then aggregated by median across frames.
registered_counts = [frame['registeredTime'].value_counts() for frame in dfs]
statistics_by_registered = [count.describe() for count in registered_counts]
counted_by_registered = [count.value_counts().sort_index() for count in registered_counts]
display(pd.concat(statistics_by_registered, axis=1).median(axis=1))
display(pd.concat(counted_by_registered, axis=1).median(axis=1))
count 2213.0 mean 1.0 std 0.0 min 1.0 25% 1.0 50% 1.0 75% 1.0 max 1.0 dtype: float64
count 1 2213.0 dtype: float64
Scheduled time¶
In [120]:
# Same duplication summary as above, but for scheduledTime.
scheduled_counts = [frame['scheduledTime'].value_counts() for frame in dfs]
statistics_by_scheduled = [count.describe() for count in scheduled_counts]
counted_by_scheduled = [count.value_counts().sort_index() for count in scheduled_counts]
display(pd.concat(statistics_by_scheduled, axis=1).median(axis=1))
display(pd.concat(counted_by_scheduled, axis=1).median(axis=1))
count 1010.000000 mean 2.180154 std 0.613783 min 1.000000 25% 2.000000 50% 2.000000 75% 3.000000 max 4.000000 dtype: float64
count 1 99.0 2 632.0 3 252.0 4 10.0 dtype: float64
Calculating frequency¶
In [121]:
def calculate_frequency(df: pd.DataFrame, time_column: str, without_duplicates: bool) -> float:
    """Return the mean sampling period of `time_column`, in seconds.

    Note: despite the name, the value is a period (s/sample); callers take
    its reciprocal to obtain a frequency in Hz.

    :param df: frame whose `time_column` holds Timedelta values.
    :param time_column: name of the column to measure.
    :param without_duplicates: if True, collapse runs of identical
        timestamps (keeping the last row) before measuring.
    :raises ValueError: if fewer than two timestamps remain — previously
        this surfaced as a cryptic IndexError / ZeroDivisionError.
    """
    timestamps = df[time_column]
    if without_duplicates:
        timestamps = df.drop_duplicates(subset=time_column, keep='last')[time_column]
    if len(timestamps) < 2:
        raise ValueError('need at least two timestamps to compute a period')
    duration = (timestamps.iloc[-1] - timestamps.iloc[0]).total_seconds()
    return duration / (len(timestamps) - 1)
By registered time¶
In [122]:
# Mean register period per frame, then the median across frames.
rt_frequency_s = np.array([
    calculate_frequency(frame, time_column='registeredTime', without_duplicates=False)
    for frame in dfs
])
rt_median_frequency_s = np.median(rt_frequency_s)
rt_median_frequency_hz = 1 / rt_median_frequency_s
print(f"Median register frequency: {rt_median_frequency_s} s or {rt_median_frequency_hz} Hz")

rt_frequency_s_without_duplicates = np.array([
    calculate_frequency(frame, time_column='registeredTime', without_duplicates=True)
    for frame in dfs
])
rt_median_frequency_s_without_duplicates = np.median(rt_frequency_s_without_duplicates)
rt_median_frequency_hz_without_duplicates = 1 / rt_median_frequency_s_without_duplicates
print(f"Without duplicates: {rt_median_frequency_s_without_duplicates} s "
      f"or {rt_median_frequency_hz_without_duplicates} Hz")

# Longest observed gap between consecutive registered samples
# (row 0 is skipped: its delta has no predecessor).
concatenated_rt_periods = pd.concat([frame.loc[1:, 'registeredDelta'] for frame in dfs])
rt_max_period = concatenated_rt_periods.max()
print(f"Register max period: {rt_max_period / 1000} s")
Median register frequency: 0.007502942750133761 s or 133.2810382942309 Hz Without duplicates: 0.007502942750133761 s or 133.2810382942309 Hz Register max period: 0.008 s
By scheduled time¶
In [123]:
# Mean schedule period per frame, then the median across frames.
st_frequency_s = np.array([
    calculate_frequency(frame, time_column='scheduledTime', without_duplicates=False)
    for frame in dfs
])
st_median_frequency_s = np.median(st_frequency_s)
st_median_frequency_hz = 1 / st_median_frequency_s
print(f"Median schedule frequency: {st_median_frequency_s} s or {st_median_frequency_hz} Hz")

st_frequency_without_duplicates_s = np.array([
    calculate_frequency(frame, time_column='scheduledTime', without_duplicates=True)
    for frame in dfs
])
st_median_frequency_s_without_duplicates = np.median(st_frequency_without_duplicates_s)
st_median_frequency_hz_without_duplicates = 1 / st_median_frequency_s_without_duplicates
print(f"Without duplicates: {st_median_frequency_s_without_duplicates} s "
      f"or {st_median_frequency_hz_without_duplicates} Hz")

# Longest observed gap between consecutive scheduled samples.
concatenated_st_periods = pd.concat([frame.loc[1:, 'scheduledDelta'] for frame in dfs])
# Bug fix: this result was previously assigned to `rt_max_period`,
# silently clobbering the registered-time value from the cell above.
st_max_period = concatenated_st_periods.max()
print(f"Schedule max period: {st_max_period / 1000} s")
Median schedule frequency: 0.007499732477260567 s or 133.33808946279518 Hz Without duplicates: 0.016363548698167793 s or 61.111438505510044 Hz Schedule max period: 0.026 s
Displaying the time series¶
In [124]:
import plotly.graph_objects as go
def display_time_series(selected_time_column: str):
    """Plot x/y coordinates of every dataframe against the chosen time column.

    :param selected_time_column: 'registeredTime' or 'scheduledTime'.
    """
    fig = go.Figure()
    for i, df in enumerate(dfs):
        # Bug fix: each trace must use its OWN frame's timestamps. The
        # previous version paired every y with one time vector built from
        # all frames concatenated, whose length (sum of all rows) did not
        # match any single frame's y — misaligning every trace.
        time_column = df[selected_time_column].sort_values().map(format_time)
        fig.add_trace(go.Scatter(x=time_column, y=df['x'], mode='lines', name=f'x{i}'))
        fig.add_trace(go.Scatter(x=time_column, y=df['y'], mode='lines', name=f'y{i}'))
    fig.update_layout(
        title='Coordinates by time',
        xaxis_title=f'{selected_time_column} (m:s.ms)',
        yaxis_title='coordinates'
    )
    # Categorical x axis: show every 100th tick label, slightly rotated.
    fig.update_xaxes(dtick=100, tickangle=30)
    fig.show(renderer='notebook_connected')
def format_time(td: pd.Timedelta) -> str:
    """Render a Timedelta as 'mm:ss.mmm', e.g. 65.5 s -> '01:05.500'."""
    total_seconds = td.total_seconds()
    minutes, seconds = divmod(total_seconds, 60)
    return '{:02d}:{:06.3f}'.format(int(minutes), seconds)
By registered time¶
In [125]:
# Coordinates plotted against the registered event time.
display_time_series('registeredTime')
By scheduled time¶
In [126]:
# Coordinates plotted against the scheduled event time.
display_time_series('scheduledTime')