Time-duplication analysis of data captured with the Wintab diagnostics tool¶
In [115]:
import pandas as pd
def clean_strings(df: pd.DataFrame) -> pd.DataFrame:
    """Strip whitespace from column headers and drop incomplete rows.

    Parameters
    ----------
    df : raw frame as read from a wtcapt CSV dump.

    Returns
    -------
    A cleaned copy; the input frame is not modified.
    """
    copy = df.copy()
    # BUG FIX: the original called `data.rename(...)` — the global loop
    # variable from the reading cell — instead of `copy`, so the function
    # only worked by accident and silently discarded the `.copy()` above.
    copy = copy.rename(columns=lambda h: h.strip())
    # drop single-string rows, i.e. prox
    copy = copy.dropna(axis=0, how='any')
    return copy
def convert_to_timedelta(df: pd.DataFrame) -> pd.DataFrame:
    """Convert the millisecond 'time' column to pandas Timedelta.

    Best-effort: if pandas rejects the column (the original author noted
    e.g. negative values), the column is left exactly as it was.

    Returns
    -------
    A copy of `df`; the input frame is not modified.
    """
    copy = df.copy()
    try:
        copy['time'] = pd.to_timedelta(copy['time'], unit='ms')
    except ValueError:
        # The original re-assigned the column to itself here; an explicit
        # no-op states the keep-as-is intent without the dead assignment.
        pass
    return copy
def skip_trailing_1s(df: pd.DataFrame) -> pd.DataFrame:
    """Discard samples within one second of the capture's start and end.

    Returns the rows whose 'time' lies in the inclusive window
    [min + 1s, max - 1s]; the input frame is not modified.
    """
    trimmed = df.copy()
    one_second = pd.Timedelta(seconds=1)
    window_start = trimmed['time'].min() + one_second
    window_end = trimmed['time'].max() - one_second
    # Series.between is inclusive on both ends, matching >= and <=.
    return trimmed[trimmed['time'].between(window_start, window_end)]
Reading¶
In [116]:
from os.path import join, isfile
from os import listdir
import pandas as pd

# Location of the recorded wtcapt diagnostic dumps relative to this notebook.
files_prefix = '../../data'  # was an f-string with no placeholders
folder = 'wtcapt-set/huion_l_r'
set_path = f'{files_prefix}/{folder}'
separator = ','
rows_to_skip = 50  # preamble lines emitted by the diagnostics tool

dfs = []
# sorted() makes the order of `dfs` — and every statistic derived from it —
# independent of the filesystem's directory-listing order.
for f in sorted(listdir(set_path)):
    filepath = join(set_path, f)
    if isfile(filepath):
        data = pd.read_csv(filepath, sep=separator, encoding='utf-16',
                           skiprows=rows_to_skip, on_bad_lines='warn')
        data = clean_strings(data)
        data = convert_to_timedelta(data)
        data = skip_trailing_1s(data)
        dfs.append(data)
Sizes¶
In [117]:
# Row count of every per-file frame after cleaning and trimming.
print("The size of each dataframe: ", [len(frame) for frame in dfs])
The size of each dataframe: [1657, 1291, 663, 775, 776, 1113, 1193, 986, 1006, 1374]
Aggregated statistics of the data¶
In [118]:
def get_statistics(df: pd.DataFrame) -> pd.DataFrame:
    """Return df.describe() with the 'time' column expressed in seconds.

    The input frame is not modified; 'time' must be a timedelta column.
    """
    with_seconds = df.assign(time=df['time'].dt.total_seconds())
    return with_seconds.describe()
statistics = [get_statistics(df) for df in dfs]
# Stack the per-file describe() tables, then take the element-wise median
# of each statistic (rows keyed by the describe() index: mean, std, ...).
stacked = pd.concat(statistics).reset_index()
concatenated_statistics = stacked.groupby('index').median()
display(concatenated_statistics)
status | time | serial | csr | bttn | X | Y | npres | azi | alt | |
---|---|---|---|---|---|---|---|---|---|---|
index | ||||||||||
25% | 0.0 | 1.727354e+06 | 11082.625000 | 1.0 | 1.0 | 18384.500000 | 13755.500000 | 3027.750000 | 1831.000000 | 560.000000 |
50% | 0.0 | 1.727356e+06 | 11461.250000 | 1.0 | 1.0 | 20059.000000 | 16532.000000 | 3195.000000 | 1869.000000 | 580.000000 |
75% | 0.0 | 1.727358e+06 | 11839.875000 | 1.0 | 1.0 | 22100.750000 | 19784.750000 | 3507.750000 | 1908.000000 | 605.000000 |
count | 1059.5 | 1.059500e+03 | 1059.500000 | 1059.5 | 1059.5 | 1059.500000 | 1059.500000 | 1059.500000 | 1059.500000 | 1059.500000 |
max | 0.0 | 1.727360e+06 | 12218.500000 | 1.0 | 1.0 | 25288.000000 | 25973.000000 | 3935.500000 | 2024.500000 | 645.000000 |
mean | 0.0 | 1.727356e+06 | 11461.250000 | 1.0 | 1.0 | 20133.158563 | 16685.107424 | 3227.883621 | 1870.509678 | 582.892682 |
min | 0.0 | 1.727353e+06 | 10704.000000 | 1.0 | 1.0 | 15266.500000 | 7901.500000 | 2485.000000 | 1756.000000 | 510.000000 |
std | 0.0 | 1.392082e+00 | 305.995609 | 0.0 | 0.0 | 2546.710431 | 4503.766958 | 472.151834 | 58.259407 | 29.338679 |
Grouping by time¶
In [119]:
statistics_by_time = []
counted_by_time = []
# Per capture file: how many rows share each timestamp (duplication), and
# how often each multiplicity occurs.
for frame in dfs:
    per_timestamp = frame['time'].value_counts()
    statistics_by_time.append(per_timestamp.describe())
    counted_by_time.append(per_timestamp.value_counts().sort_index())
Aggregated statistics of the number of elements for each group¶
In [120]:
# Median across files: first of the describe() statistics, then of the
# multiplicity counts.
for per_file_summaries in (statistics_by_time, counted_by_time):
    display(pd.concat(per_file_summaries, axis=1).median(axis=1))
count 308.500000 mean 3.433569 std 0.496473 min 3.000000 25% 3.000000 50% 3.000000 75% 4.000000 max 4.000000 dtype: float64
count 3 174.5 4 134.0 7 19.0 dtype: float64
Calculating the frequency¶
In [121]:
import numpy as np
def calculate_frequency(df: pd.DataFrame, without_duplicates: bool) -> float:
    """Return the mean sampling period in seconds (not Hz, despite the name).

    Parameters
    ----------
    df : frame with a timedelta 'time' column in capture order.
    without_duplicates : if True, rows sharing a timestamp are collapsed
        to the last occurrence before the period is computed.

    Raises
    ------
    ValueError : if fewer than two timestamps remain — the original code
        divided by zero in that case.
    """
    timestamps = df['time']
    if without_duplicates:
        timestamps = df.drop_duplicates(subset='time', keep='last')['time']
    if len(timestamps) < 2:
        raise ValueError("need at least two timestamps to compute a period")
    first_time = timestamps.iloc[0]
    last_time = timestamps.iloc[-1]
    duration = (last_time - first_time).total_seconds()
    return duration / (len(timestamps) - 1)
# Mean sampling period per file, duplicate timestamps included.
frequency_s = np.array(
    [calculate_frequency(df, without_duplicates=False) for df in dfs])
median_frequency_s = np.median(frequency_s)
# NOTE(review): despite the names, `*_s` values are periods in seconds;
# the frequency in Hz is the reciprocal taken here.
median_frequency_hz = 1 / median_frequency_s
print(f"Median register frequency: {median_frequency_s} s or {median_frequency_hz} Hz")
# Same statistic after collapsing rows that share a timestamp — this is
# the effective rate of distinct time readings.
frequency_s_without_duplicates = np.array(
    [calculate_frequency(df, without_duplicates=True) for df in dfs])
median_frequency_s_without_duplicates = np.median(frequency_s_without_duplicates)
median_frequency_hz_without_duplicates = 1 / median_frequency_s_without_duplicates
print(f"Without duplicates: {median_frequency_s_without_duplicates} s "
      f"or {median_frequency_hz_without_duplicates} Hz")
# Largest gap between consecutive samples within any single file; diff()
# runs per file before concatenation, so cross-file gaps do not count.
concatenated_periods = pd.concat([df['time'].diff() for df in dfs])
max_period = concatenated_periods.max().total_seconds()
print(f"Max period: {max_period} s")
Median register frequency: 0.004539483786105443 s or 220.28936485263435 Hz Without duplicates: 0.015625309514859697 s or 63.998732252247436 Hz Max period: 0.032 s
Displaying time series of this data¶
In [122]:
import plotly.graph_objects as go
def format_time(td: pd.Timedelta):
    """Format a timedelta as '0:SS.sss', keeping only seconds within the minute."""
    seconds_in_minute = td.total_seconds() % 60
    return f'0:{seconds_in_minute:06.3f}'
fig = go.Figure()
# BUG FIX: the original built one x-axis from ALL files concatenated and
# sorted, then paired it with each single file's X/Y columns. The lengths
# differ, so every trace was drawn against timestamps that did not belong
# to its own samples. Plot each file against its own formatted times.
for i, df in enumerate(dfs):
    time_column = df['time'].map(format_time)
    fig.add_trace(go.Scatter(x=time_column, y=df['X'], mode='lines', name=f'x{i}'))
    fig.add_trace(go.Scatter(x=time_column, y=df['Y'], mode='lines', name=f'y{i}'))
fig.update_layout(
    title='Coordinates by time',
    xaxis_title='time',
    yaxis_title='coordinates'
)
fig.update_xaxes(dtick=20)
fig.show(renderer='notebook_connected')