Aggregated time duplication analysis for the Java-format datasets¶
In [127]:
from os import listdir
from os.path import isfile, join
import numpy as np
import pandas as pd
time_from_start_columns = ['registeredTime', 'scheduledTime']
delta_columns = ['registeredDelta', 'scheduledDelta']
def convert_to_timedelta(df: pd.DataFrame) -> pd.DataFrame:
    copy = df.copy()
    for column in time_from_start_columns:
        try:
            copy[column] = pd.to_timedelta(copy[column])
        except ValueError:  # e.g. negative values that fail to parse
            pass  # leave the column as-is
    for column in delta_columns:
        copy[column] = pd.to_timedelta(copy[column]).dt.total_seconds() * 1000  # ms
    return copy
def skip_trailing_1s(df: pd.DataFrame) -> pd.DataFrame:
    # Trim the first and last second of each recording to drop edge artifacts.
    copy = df.copy()
    first, last = copy['registeredTime'].min(), copy['registeredTime'].max()
    one_second = pd.Timedelta(seconds=1)
    lower, upper = first + one_second, last - one_second
    return copy[(copy['registeredTime'] >= lower) & (copy['registeredTime'] <= upper)]
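For context, a minimal sketch of the input these helpers expect. The sample values below are invented, assuming the time columns are serialized as timedelta-parseable strings:

sample = pd.DataFrame({
    'registeredTime': ['00:00:01.000', '00:00:01.005'],
    'scheduledTime': ['00:00:01.001', '00:00:01.006'],
    'registeredDelta': ['00:00:00.005', '00:00:00.005'],
    'scheduledDelta': ['00:00:00.005', '00:00:00.005'],
})
converted = convert_to_timedelta(sample)
print(converted.dtypes)              # time columns become timedelta64[ns]
print(converted['registeredDelta'])  # delta columns become float milliseconds: 5.0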
Reading and time conversion¶
In [128]:
files_prefix = '../../data'
folder = 'java-set/huion_l_r'
set_path = f'{files_prefix}/{folder}'
comment_mark = '#'
separator = '\\s+'
dfs = []
for f in listdir(set_path):
    filepath = join(set_path, f)
    if isfile(filepath):
        data = pd.read_csv(filepath, sep=separator, comment=comment_mark)
        data = convert_to_timedelta(data)
        data = skip_trailing_1s(data)
        dfs.append(data)
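For reference, a hypothetical input file in the layout this reader assumes: whitespace-separated columns, lines starting with # treated as comments, and a header row naming the columns used throughout this notebook (all values below are invented):

# recording metadata goes here
absoluteTime registeredTime scheduledTime registeredDelta scheduledDelta x y tiltX tiltY pressure
1714386000000 00:00:02.877 00:00:02.878 00:00:00.000 00:00:00.000 11.651 -99.663 0 0 0
1714386000005 00:00:02.882 00:00:02.883 00:00:00.005 00:00:00.005 11.903 -99.120 0 0 0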
Sizes¶
In [129]:
print('Size of each dataframe:', [df.index.size for df in dfs])
Size of each dataframe: [1729, 1717, 1592, 1452, 1581, 1788, 2071, 2071, 1769, 1861]
Aggregated statistics of the data¶
In [130]:
def get_statistics(df: pd.DataFrame) -> pd.DataFrame:
    copy = df.copy()
    copy['registeredTime'] = copy['registeredTime'].dt.total_seconds()
    copy['scheduledTime'] = copy['scheduledTime'].dt.total_seconds()
    return copy.describe()
statistics = [get_statistics(df) for df in dfs]
concatenated_statistics = (pd.concat(statistics)
                           .reset_index()
                           .groupby('index')
                           .median())
display(concatenated_statistics)
| index | absoluteTime | registeredTime | scheduledTime | registeredDelta | scheduledDelta | x | y | tiltX | tiltY | pressure |
|---|---|---|---|---|---|---|---|---|---|---|
| 25% | 1.714386e+12 | 4.848875 | 4.849375 | 0.000000 | 0.000000 | 39.435000 | -71.251875 | 0.0 | 0.0 | 0.0 |
| 50% | 1.714386e+12 | 6.771500 | 6.771500 | 1.500000 | 1.000000 | 61.572375 | -52.296188 | 0.0 | 0.0 | 0.0 |
| 75% | 1.714386e+12 | 8.785000 | 8.785500 | 9.625000 | 10.000000 | 79.407750 | -32.869969 | 0.0 | 0.0 | 0.0 |
| count | 1.749000e+03 | 1749.000000 | 1749.000000 | 1749.000000 | 1749.000000 | 1749.000000 | 1749.000000 | 1749.0 | 1749.0 | 1749.0 |
| max | 1.714386e+12 | 10.813000 | 10.813000 | 28.500000 | 31.000000 | 106.743375 | -5.736000 | 0.0 | 0.0 | 0.0 |
| mean | 1.714386e+12 | 6.775154 | 6.775682 | 4.598206 | 4.598206 | 60.439182 | -52.016447 | 0.0 | 0.0 | 0.0 |
| min | 1.714386e+12 | 2.877000 | 2.878000 | 0.000000 | 0.000000 | 11.651250 | -99.663000 | 0.0 | 0.0 | 0.0 |
| std | 2.319017e+03 | 2.319017 | 2.318927 | 6.019953 | 6.155643 | 24.637504 | 24.446360 | 0.0 | 0.0 | 0.0 |
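The cell above takes the element-wise median across the per-file describe() tables: concatenating them stacks rows with duplicate statistic labels, and grouping by that label takes the median over files. A toy demonstration of the same trick with made-up numbers:

a = pd.DataFrame({'v': [1, 2, 3]}).describe()
b = pd.DataFrame({'v': [10, 20, 30]}).describe()
median_stats = pd.concat([a, b]).reset_index().groupby('index').median()
print(median_stats.loc['mean', 'v'])  # median of the two means: (2.0 + 20.0) / 2 = 11.0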
Registered time¶
In [131]:
statistics_by_registered = []
counted_by_registered = []
for data in dfs:
    count = data['registeredTime'].value_counts()
    statistics_by_registered.append(count.describe())
    counted_by_registered.append(count.value_counts().sort_index())
display(pd.concat(statistics_by_registered, axis=1).median(axis=1))
display(pd.concat(counted_by_registered, axis=1).median(axis=1))
count    895.000000
mean       1.837399
std        1.011679
min        1.000000
25%        1.000000
50%        1.000000
75%        3.000000
max        4.000000
dtype: float64
count
1    494.5
2    131.5
3    232.5
4     60.0
dtype: float64
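The double value_counts() above reads as follows: the inner call counts how many rows share each registeredTime, and the outer call counts how many timestamps occur with each multiplicity. A toy example with invented keys:

ts = pd.Series(['a', 'a', 'a', 'b', 'b', 'c'])   # 'a' occurs 3 times, 'b' twice, 'c' once
multiplicity = ts.value_counts()                 # a -> 3, b -> 2, c -> 1
print(multiplicity.value_counts().sort_index())  # 1 -> 1, 2 -> 1, 3 -> 1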
Scheduled time¶
In [132]:
statistics_by_scheduled = []
counted_by_scheduled = []
for data in dfs:
    count = data['scheduledTime'].value_counts()
    statistics_by_scheduled.append(count.describe())
    counted_by_scheduled.append(count.value_counts().sort_index())
display(pd.concat(statistics_by_scheduled, axis=1).median(axis=1))
display(pd.concat(counted_by_scheduled, axis=1).median(axis=1))
count    890.500000
mean       1.904226
std        1.045096
min        1.000000
25%        1.000000
50%        1.000000
75%        3.000000
max        4.000000
dtype: float64
count
1    480.5
2    116.0
3    245.0
4     71.0
dtype: float64
Calculating frequency¶
In [133]:
def calculate_frequency(df: pd.DataFrame, time_column: str, without_duplicates: bool) -> float:
    """Return the mean sampling period in seconds (the inverse of the frequency)."""
    timestamps = df[time_column]
    if without_duplicates:
        timestamps = df.drop_duplicates(subset=time_column, keep='last')[time_column]
    first_time = timestamps.iloc[0]
    last_time = timestamps.iloc[-1]
    duration = (last_time - first_time).total_seconds()
    return duration / (len(timestamps) - 1)
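A quick sanity check of calculate_frequency on three invented samples spaced 5 ms apart: the span is 10 ms over 2 intervals, so the mean period is 0.005 s, i.e. 200 Hz:

demo = pd.DataFrame({'t': pd.to_timedelta(['00:00:00.000', '00:00:00.005', '00:00:00.010'])})
period_s = calculate_frequency(demo, time_column='t', without_duplicates=False)
print(period_s, '->', 1 / period_s, 'Hz')  # 0.005 -> 200.0 Hz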
By registered time¶
In [134]:
rt_frequency_s = np.array(
    [calculate_frequency(df, time_column='registeredTime', without_duplicates=False) for df in dfs])
rt_median_frequency_s = np.median(rt_frequency_s)
rt_median_frequency_hz = 1 / rt_median_frequency_s
print(f"Median register frequency: {rt_median_frequency_s} s or {rt_median_frequency_hz} Hz")
rt_frequency_s_without_duplicates = np.array(
    [calculate_frequency(df, time_column='registeredTime', without_duplicates=True) for df in dfs])
rt_median_frequency_s_without_duplicates = np.median(rt_frequency_s_without_duplicates)
rt_median_frequency_hz_without_duplicates = 1 / rt_median_frequency_s_without_duplicates
print(f"Without duplicates: {rt_median_frequency_s_without_duplicates} s "
      f"or {rt_median_frequency_hz_without_duplicates} Hz")
concatenated_rt_periods = pd.concat([df.loc[1:, 'registeredDelta'] for df in dfs])
rt_max_period = concatenated_rt_periods.max()
print(f"Register max period: {rt_max_period / 1000} s")
Median register frequency: 0.004594600280504908 s or 217.64678948091395 Hz
Without duplicates: 0.008415922125832205 s or 118.82239225224716 Hz
Register max period: 0.032 s
By scheduled time¶
In [135]:
st_frequency_s = np.array(
    [calculate_frequency(df, time_column='scheduledTime', without_duplicates=False) for df in dfs])
st_median_frequency_s = np.median(st_frequency_s)
st_median_frequency_hz = 1 / st_median_frequency_s
print(f"Median schedule frequency: {st_median_frequency_s} s or {st_median_frequency_hz} Hz")
st_frequency_s_without_duplicates = np.array(
    [calculate_frequency(df, time_column='scheduledTime', without_duplicates=True) for df in dfs])
st_median_frequency_s_without_duplicates = np.median(st_frequency_s_without_duplicates)
st_median_frequency_hz_without_duplicates = 1 / st_median_frequency_s_without_duplicates
print(f"Without duplicates: {st_median_frequency_s_without_duplicates} s "
      f"or {st_median_frequency_hz_without_duplicates} Hz")
concatenated_st_periods = pd.concat([df.loc[1:, 'scheduledDelta'] for df in dfs])
st_max_period = concatenated_st_periods.max()
print(f"Schedule max period: {st_max_period / 1000} s")
Median schedule frequency: 0.004594358734611189 s or 217.65823214164573 Hz
Without duplicates: 0.008767232767232767 s or 114.06107566089335 Hz
Schedule max period: 0.034 s
Displaying the time series¶
In [136]:
import plotly.graph_objects as go
def display_time_series(selected_time_column: str):
    fig = go.Figure()
    for i, df in enumerate(dfs):
        # Plot each recording against its own formatted time axis,
        # so x and y traces stay aligned with their source dataframe.
        time_values = df[selected_time_column].map(format_time)
        fig.add_trace(go.Scatter(x=time_values, y=df['x'], mode='lines', name=f'x{i}'))
        fig.add_trace(go.Scatter(x=time_values, y=df['y'], mode='lines', name=f'y{i}'))
    fig.update_layout(
        title='Coordinates by time',
        xaxis_title=f'{selected_time_column} (m:s.ms)',
        yaxis_title='coordinates'
    )
    fig.update_xaxes(dtick=100, tickangle=30)
    fig.show(renderer='notebook_connected')
def format_time(td: pd.Timedelta) -> str:
    return '{:02d}:{:06.3f}'.format(int(td.total_seconds() // 60), td.total_seconds() % 60)
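For example, format_time renders a 75.5-second offset in the m:s.ms form used on the x axis:

print(format_time(pd.Timedelta(seconds=75.5)))  # 01:15.500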
By registered time¶
In [137]:
display_time_series('registeredTime')
By scheduled time¶
In [138]:
display_time_series('scheduledTime')