Timestamp-duplication analysis of data from the Wintab diagnostics tool¶
In [107]:
import pandas as pd
def clean_strings(df: pd.DataFrame) -> pd.DataFrame:
    """Strip whitespace from column headers and drop incomplete rows.

    Rows with missing values correspond to single-string diagnostic lines
    (e.g. proximity events) that do not carry a full sample record.

    :param df: raw frame as read from the diagnostics CSV; not mutated.
    :returns: a cleaned copy of ``df``.
    """
    copy = df.copy()
    # Bug fix: the original renamed the *global* `data` instead of the local
    # copy, so this function only worked when called from the reading loop.
    copy = copy.rename(columns=lambda h: h.strip())
    # drop single-string rows, i.e. prox
    copy = copy.dropna(axis=0, how='any')
    return copy
def convert_to_timedelta(df: pd.DataFrame) -> pd.DataFrame:
    """Convert the 'time' column from milliseconds to ``Timedelta``.

    If the conversion raises ``ValueError`` (the original author noted this
    happens for negative values in the raw diagnostics output — presumably;
    verify against the tool), the column is left as-is.

    :param df: frame with a numeric 'time' column in ms; not mutated.
    :returns: a copy of ``df`` with 'time' converted where possible.
    """
    copy = df.copy()
    try:
        copy['time'] = pd.to_timedelta(copy['time'], unit='ms')
    except ValueError:  # i.e. negative values
        # Keep the raw column unchanged (the original performed a
        # no-op self-assignment here).
        pass
    return copy
def skip_trailing_1s(df: pd.DataFrame) -> pd.DataFrame:
    """Drop samples within one second of the first and last timestamps.

    Trims the warm-up and wind-down second at both ends of a capture so
    the statistics reflect steady-state sampling only.

    :param df: frame with a Timedelta 'time' column; not mutated.
    :returns: the rows whose time lies in the trimmed (inclusive) window.
    """
    one_second = pd.Timedelta(seconds=1)
    lower_bound = df['time'].min() + one_second
    upper_bound = df['time'].max() - one_second
    # between() is inclusive on both ends, matching >= and <=.
    in_window = df['time'].between(lower_bound, upper_bound)
    return df.copy()[in_window]
Reading¶
In [108]:
from os.path import join, isfile
from os import listdir
import pandas as pd

# Location of the capture set and CSV parsing parameters.
files_prefix = '../../data'  # plain string: the original f-string had no placeholders
folder = 'wtcapt-set/bamboo_l_r'
set_path = f'{files_prefix}/{folder}'
separator = ','
rows_to_skip = 50  # diagnostic header lines preceding the CSV data

dfs = []
# sorted() makes the order of dfs reproducible across platforms;
# listdir() order is otherwise filesystem-dependent.
for f in sorted(listdir(set_path)):
    filepath = join(set_path, f)
    if isfile(filepath):
        data = pd.read_csv(filepath, sep=separator, encoding='utf-16',
                           skiprows=rows_to_skip, on_bad_lines='warn')
        data = clean_strings(data)
        data = convert_to_timedelta(data)
        data = skip_trailing_1s(data)
        dfs.append(data)
Sizes¶
In [109]:
print("The size of each dataframe: ", [df.index.size for df in dfs])
The size of each dataframe: [326, 232, 298, 415, 286, 321, 294, 280, 191, 676]
Aggregated statistics of the data¶
In [110]:
def get_statistics(df: pd.DataFrame) -> pd.DataFrame:
    """Describe a capture frame with 'time' expressed in seconds.

    Converts the Timedelta 'time' column to total seconds so that
    ``describe()`` can include it in the numeric summary.

    :param df: frame with a Timedelta 'time' column; not mutated.
    :returns: the ``describe()`` summary frame.
    """
    with_seconds = df.assign(time=df['time'].dt.total_seconds())
    return with_seconds.describe()
# Summarise each capture, then take the element-wise median across captures
# for every describe() row (count, mean, std, quantiles, ...).
statistics = [get_statistics(df) for df in dfs]
stacked = pd.concat(statistics).reset_index()
concatenated_statistics = stacked.groupby('index').median()
display(concatenated_statistics)
status | time | serial | csr | bttn | X | Y | npres | azi | alt | |
---|---|---|---|---|---|---|---|---|---|---|
index | ||||||||||
25% | 0.0 | 1.726331e+06 | 209.750000 | 1.0 | 1.0 | 5927.000000 | 2222.625000 | 554.000000 | 0.0 | 900.0 |
50% | 0.0 | 1.726332e+06 | 283.500000 | 1.0 | 1.0 | 8491.000000 | 4555.250000 | 579.000000 | 0.0 | 900.0 |
75% | 0.0 | 1.726333e+06 | 357.250000 | 1.0 | 1.0 | 10620.125000 | 6789.125000 | 605.500000 | 0.0 | 900.0 |
count | 296.0 | 2.960000e+02 | 296.000000 | 296.0 | 296.0 | 296.000000 | 296.000000 | 296.000000 | 296.0 | 296.0 |
max | 0.0 | 1.726334e+06 | 431.000000 | 1.0 | 1.0 | 13464.500000 | 8736.000000 | 652.500000 | 0.0 | 900.0 |
mean | 0.0 | 1.726332e+06 | 283.500000 | 1.0 | 1.0 | 8466.813510 | 4393.209940 | 578.685779 | 0.0 | 900.0 |
min | 0.0 | 1.726330e+06 | 136.000000 | 1.0 | 1.0 | 4169.000000 | 91.000000 | 505.500000 | 0.0 | 900.0 |
std | 0.0 | 6.422156e-01 | 85.592056 | 0.0 | 0.0 | 2821.533471 | 2513.585786 | 39.087907 | 0.0 | 0.0 |
Grouping by time¶
In [111]:
# For each capture: how many samples share each timestamp, and how often
# each multiplicity occurs.
statistics_by_time = []
counted_by_time = []
for df in dfs:  # renamed from `data` to avoid shadowing the reading cell's global
    samples_per_timestamp = df['time'].value_counts()
    statistics_by_time.append(samples_per_timestamp.describe())
    counted_by_time.append(samples_per_timestamp.value_counts().sort_index())
Aggregated statistics of the number of elements for each group¶
In [112]:
display(pd.concat(statistics_by_time, axis=1).median(axis=1))
display(pd.concat(counted_by_time, axis=1).median(axis=1))
count 296.0 mean 1.0 std 0.0 min 1.0 25% 1.0 50% 1.0 75% 1.0 max 1.0 dtype: float64
count 1 296.0 dtype: float64
Calculating the frequency¶
In [113]:
import numpy as np
def calculate_frequency(df: pd.DataFrame, without_duplicates: bool) -> float:
    """Return the mean sampling *period* in seconds.

    NOTE(review): despite the name, this returns a period, not a frequency;
    callers invert the result to obtain Hz.

    :param df: frame with a Timedelta 'time' column, assumed time-ordered.
    :param without_duplicates: drop repeated timestamps (keeping the last)
        before measuring.
    :returns: total duration divided by the number of intervals, or NaN
        when there are fewer than two timestamps (avoids ZeroDivisionError).
    """
    timestamps = df['time']
    if without_duplicates:
        timestamps = df.drop_duplicates(subset='time', keep='last')['time']
    if len(timestamps) < 2:
        # No interval to measure; the original divided by zero here.
        return float('nan')
    first_time = timestamps.iloc[0]
    last_time = timestamps.iloc[-1]
    duration = (last_time - first_time).total_seconds()
    return duration / (len(timestamps) - 1)
# Median sampling period across captures, with and without duplicate
# timestamps, plus the worst-case gap between consecutive samples.
raw_periods = []
dedup_periods = []
for df in dfs:
    raw_periods.append(calculate_frequency(df, without_duplicates=False))
    dedup_periods.append(calculate_frequency(df, without_duplicates=True))

frequency_s = np.array(raw_periods)
median_frequency_s = np.median(frequency_s)
median_frequency_hz = 1 / median_frequency_s
print(f"Median register frequency: {median_frequency_s} s or {median_frequency_hz} Hz")

frequency_s_without_duplicates = np.array(dedup_periods)
median_frequency_s_without_duplicates = np.median(frequency_s_without_duplicates)
median_frequency_hz_without_duplicates = 1 / median_frequency_s_without_duplicates
print(f"Without duplicates: {median_frequency_s_without_duplicates} s "
      f"or {median_frequency_hz_without_duplicates} Hz")

concatenated_periods = pd.concat([df['time'].diff() for df in dfs])
max_period = concatenated_periods.max().total_seconds()
print(f"Max period: {max_period} s")
Median register frequency: 0.007503870192307692 s or 133.26456540054653 Hz Without duplicates: 0.007503870192307692 s or 133.26456540054653 Hz Max period: 0.008 s
Displaying the time series of this data¶
In [114]:
import plotly.graph_objects as go
def format_time(td: pd.Timedelta) -> str:
    """Format a Timedelta as 'M:SS.mmm' for plot axis labels.

    The original hard-coded the minutes part to '0:', which silently
    mislabelled any value >= 60 s; minutes are now computed. Output is
    identical for values under one minute.
    """
    total = td.total_seconds()
    return '{}:{:06.3f}'.format(int(total // 60), total % 60)
fig = go.Figure()
for i, df in enumerate(dfs):
    # Bug fix: the original built one x-axis from the concatenation of ALL
    # captures' times, so each trace's x had a different length than its y
    # and the points were misaligned. Each trace now uses its own capture's
    # times, sorted together with its coordinates.
    ordered = df.sort_values('time')
    times = ordered['time'].map(format_time)
    fig.add_trace(go.Scatter(x=times, y=ordered['X'], mode='lines', name=f'x{i}'))
    fig.add_trace(go.Scatter(x=times, y=ordered['Y'], mode='lines', name=f'y{i}'))
fig.update_layout(
    title='Coordinates by time',
    xaxis_title='time',
    yaxis_title='coordinates'
)
fig.update_xaxes(dtick=20)
fig.show(renderer='notebook_connected')