Time duplication analysis for the Java-format data¶
File reading¶
In [839]:
import logging
import pandas as pd
# Location of the recording, relative to the notebook.
files_prefix = '../../data'
filename = '2024-04-08_22-46-16_huion-java.txt'
filepath = '/'.join([files_prefix, filename])

# Text following '#' is ignored by read_csv; columns are split on runs of whitespace.
comment_mark = '#'
separator = '\\s+'
df = pd.read_csv(filepath, comment=comment_mark, sep=separator)
Reading start time¶
The start time value is expected in a format that contains no spaces, because it is read as the last space-separated token of the header line
In [840]:
import os
from datetime import datetime as dt
start_time = None
time_header = 'Start time'

print('Data header:')
with open(filepath, 'r') as file:
    for row in file:
        if time_header in row:
            # The value is the last whitespace-separated token of the line;
            # splitting on any whitespace also drops the trailing newline.
            start_time = pd.to_datetime(row.split()[-1])
        if row.strip() == '':
            # The first blank line ends the header.
            break
        print(row.strip(comment_mark + '\n'))

if start_time is None:
    # Fallback: use the file's ctime. os.path.getctime returns seconds since the
    # epoch; unit='s' yields a naive UTC timestamp, which is what the deprecated
    # datetime.utcfromtimestamp produced. Rounding to microseconds keeps the value
    # convertible with to_pydatetime() without losing nanoseconds.
    # NOTE(review): on Unix, ctime is the last metadata change, not the creation time.
    start_time = pd.to_datetime(os.path.getctime(filepath), unit='s').round('us')
    logging.warning("The start time value is not present in the header. The file creation date is selected")
WARNING:root:The start time value is not present in the header. The file creation date is selected
Data header: Tablet rate 250 Screen dimension width=1920 height=1080 Tablet size width=344.16 height=193.59 Period 25
Initial data displaying¶
In [841]:
# Show the resolved start time and the frame exactly as read_csv parsed it
# (the time columns are converted in a later cell).
print(f'The start time is {start_time}')
display(df)
The start time is 2024-04-13 08:31:42.816981
registeredTime | scheduledTime | registeredDelta | scheduledDelta | availableMs | x | y | tiltX | tiltY | pressure | |
---|---|---|---|---|---|---|---|---|---|---|
0 | -475696:-28:-39.741 | 0:0:0.819 | 0:0:0.000 | 0:0:0.000 | 25 | 12.90600 | -19.717503 | 0.147280 | 0.055411 | 0.124405 |
1 | -475696:-28:-39.741 | 0:0:0.819 | 0:0:0.000 | 0:0:0.000 | 25 | 12.54750 | -19.896755 | 0.147280 | 0.055411 | 0.183128 |
2 | -475696:-28:-39.741 | 0:0:0.835 | 0:0:0.000 | 0:0:0.016 | 25 | 12.18900 | -20.075992 | 0.148434 | 0.093169 | 0.226590 |
3 | -475696:-28:-39.756 | 0:0:0.835 | 0:0:0.015 | 0:0:0.000 | 25 | 12.00975 | -20.255245 | 0.168399 | 0.093960 | 0.251740 |
4 | -475696:-28:-39.756 | 0:0:0.835 | 0:0:0.000 | 0:0:0.000 | 25 | 11.65125 | -20.613750 | 0.160511 | 0.107156 | 0.269198 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
921 | -475696:-28:-35.912 | 0:0:4.993 | 0:0:0.000 | 0:0:0.011 | 25 | 144.47550 | -79.407750 | 0.291711 | 0.166940 | 0.364058 |
922 | -475696:-28:-35.912 | 0:0:4.993 | 0:0:0.000 | 0:0:0.000 | 25 | 146.44725 | -79.407750 | 0.282978 | 0.143975 | 0.248077 |
923 | -475696:-28:-35.928 | 0:0:4.993 | 0:0:0.016 | 0:0:0.000 | 25 | 148.23975 | -79.228500 | 0.263660 | 0.142558 | 0.143938 |
924 | -475696:-28:-35.928 | 0:0:5.004 | 0:0:0.000 | 0:0:0.011 | 25 | 149.67375 | -79.049245 | 0.270305 | 0.128495 | 0.059822 |
925 | -475696:-28:-35.928 | 0:0:5.004 | 0:0:0.000 | 0:0:0.000 | 25 | 150.92850 | -78.869995 | 0.251198 | 0.127254 | 0.007081 |
926 rows × 10 columns
Converting time columns to the Pandas time¶
In [842]:
time_from_start_columns = ['registeredTime', 'scheduledTime']
delta_columns = ['registeredDelta', 'scheduledDelta']

for column in time_from_start_columns:
    try:
        # Offsets from the start; add `start_time` to the result to get absolute timestamps.
        df[column] = pd.to_timedelta(df[column])
    except ValueError:
        # pandas rejected the values (i.e. negative values); the column is left
        # untouched so it can still be parsed from its original text.
        pass

for column in delta_columns:
    # Convert only while the column still holds text. Once it is numeric (this cell
    # has already run), pd.to_timedelta would read the millisecond floats as nanoseconds.
    if not pd.api.types.is_numeric_dtype(df[column]):
        df[column] = pd.to_timedelta(df[column]).dt.total_seconds() * 1000  # ms
Fixing negative time shift if necessary¶
The time of the registered point is received from the tablet as the number of milliseconds elapsed from an unknown starting point. On the other hand, the scheduled time is represented by UNIX time.
To calculate this unknown start time, it is assumed that there is the same difference between the first registered point and its start time as there is between the first scheduled point and the system start time (which may not always be true, especially if the library frequency is limited).
However, the delta between adjacent points remains unchanged, so these transformations are mostly cosmetic: they only make the registered time look comparable to the scheduled time.
In [843]:
import datetime
def calculate_register_start_time():
    """
    Estimates the absolute origin of the registered (tablet) clock.

    Computed from the first row of ``df`` as
    ``start_time + first registeredTime - first scheduledTime``.
    The first ``registeredTime`` must still be an H:M:S.ms string and the first
    ``scheduledTime`` must already be a pandas Timedelta.
    """
    registered_offset = convert_to_timedelta(df.loc[0, 'registeredTime'])
    scheduled_offset = df.loc[0, 'scheduledTime'].to_pytimedelta()
    return start_time.to_pydatetime() + registered_offset - scheduled_offset
def recalculate_registered_time(registered_time: str, tablet_start_time: pd.Timestamp) -> pd.Timedelta:
    """
    Re-expresses a raw registered time relative to the tablet clock origin.

    :param registered_time: raw value in the H:M:S.ms text format
    :param tablet_start_time: absolute origin of the tablet clock; a timestamp,
        so that subtracting it from ``start_time + delta`` yields a Timedelta
    :return: ``(start_time + parsed registered_time) - tablet_start_time``
    """
    delta = pd.to_timedelta(convert_to_timedelta(registered_time))
    return (start_time + delta) - tablet_start_time
def convert_to_timedelta(delta: str) -> datetime.timedelta:
    """
    Parses a time delta in the H:M:S.ms format

    Each field is converted with ``int`` and the four fields are summed as
    written, so a signed hour/minute/second field and an unsigned millisecond
    field are simply added together.
    """
    hours_part, minutes_part, seconds_part = delta.split(':')
    whole_seconds, millis = seconds_part.split('.')
    return datetime.timedelta(
        hours=int(hours_part),
        minutes=int(minutes_part),
        seconds=int(whole_seconds),
        milliseconds=int(millis),
    )
# Probe whether pandas can parse the column. It cannot when the values are
# still raw negative H:M:S.ms strings, in which case they are re-based here.
try:
    pd.to_timedelta(df['registeredTime'])
except ValueError:
    tablet_start_time = pd.to_datetime(calculate_register_start_time())

    def _rebase_registered_time(raw_time):
        # Bind the computed tablet origin for use with Series.map.
        return recalculate_registered_time(raw_time, tablet_start_time)

    df['registeredTime'] = df['registeredTime'].map(_rebase_registered_time)

display(df)
registeredTime | scheduledTime | registeredDelta | scheduledDelta | availableMs | x | y | tiltX | tiltY | pressure | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 0 days 00:00:00.819000 | 0 days 00:00:00.819000 | 0.0 | 0.0 | 25 | 12.90600 | -19.717503 | 0.147280 | 0.055411 | 0.124405 |
1 | 0 days 00:00:00.819000 | 0 days 00:00:00.819000 | 0.0 | 0.0 | 25 | 12.54750 | -19.896755 | 0.147280 | 0.055411 | 0.183128 |
2 | 0 days 00:00:00.819000 | 0 days 00:00:00.835000 | 0.0 | 16.0 | 25 | 12.18900 | -20.075992 | 0.148434 | 0.093169 | 0.226590 |
3 | 0 days 00:00:00.834000 | 0 days 00:00:00.835000 | 15.0 | 0.0 | 25 | 12.00975 | -20.255245 | 0.168399 | 0.093960 | 0.251740 |
4 | 0 days 00:00:00.834000 | 0 days 00:00:00.835000 | 0.0 | 0.0 | 25 | 11.65125 | -20.613750 | 0.160511 | 0.107156 | 0.269198 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
921 | 0 days 00:00:04.990000 | 0 days 00:00:04.993000 | 0.0 | 11.0 | 25 | 144.47550 | -79.407750 | 0.291711 | 0.166940 | 0.364058 |
922 | 0 days 00:00:04.990000 | 0 days 00:00:04.993000 | 0.0 | 0.0 | 25 | 146.44725 | -79.407750 | 0.282978 | 0.143975 | 0.248077 |
923 | 0 days 00:00:05.006000 | 0 days 00:00:04.993000 | 16.0 | 0.0 | 25 | 148.23975 | -79.228500 | 0.263660 | 0.142558 | 0.143938 |
924 | 0 days 00:00:05.006000 | 0 days 00:00:05.004000 | 0.0 | 11.0 | 25 | 149.67375 | -79.049245 | 0.270305 | 0.128495 | 0.059822 |
925 | 0 days 00:00:05.006000 | 0 days 00:00:05.004000 | 0.0 | 0.0 | 25 | 150.92850 | -78.869995 | 0.251198 | 0.127254 | 0.007081 |
926 rows × 10 columns
Data statistics¶
In [844]:
# Per-column summary statistics of the frame after the time conversions above.
display(df.describe())
registeredTime | scheduledTime | registeredDelta | scheduledDelta | x | y | tiltX | tiltY | pressure | |
---|---|---|---|---|---|---|---|---|---|
count | 926 | 926 | 926.000000 | 926.000000 | 926.000000 | 926.000000 | 926.000000 | 926.000000 | 926.000000 |
mean | 0 days 00:00:02.913481641 | 0 days 00:00:02.913210583 | 4.521598 | 4.519438 | 89.091508 | -45.652613 | 0.279436 | 0.209649 | 0.618491 |
std | 0 days 00:00:01.210993335 | 0 days 00:00:01.210882965 | 7.093588 | 5.513928 | 47.337068 | 19.867742 | 0.072402 | 0.060057 | 0.145330 |
min | 0 days 00:00:00.819000 | 0 days 00:00:00.819000 | 0.000000 | 0.000000 | 9.500250 | -86.040000 | 0.127694 | 0.055411 | 0.007081 |
25% | 0 days 00:00:01.865000 | 0 days 00:00:01.862750 | 0.000000 | 0.000000 | 48.980063 | -60.945000 | 0.231788 | 0.169633 | 0.487761 |
50% | 0 days 00:00:02.912000 | 0 days 00:00:02.912000 | 0.000000 | 0.000000 | 88.728750 | -46.604995 | 0.284107 | 0.219967 | 0.684959 |
75% | 0 days 00:00:03.959000 | 0 days 00:00:03.962000 | 15.000000 | 11.000000 | 129.149625 | -28.142242 | 0.342257 | 0.257720 | 0.739684 |
max | 0 days 00:00:05.006000 | 0 days 00:00:05.004000 | 16.000000 | 23.000000 | 174.051750 | -13.622999 | 0.404596 | 0.316523 | 0.791845 |
In [845]:
# How many rows share each distinct registered timestamp.
counted_by_registered = df['registeredTime'].value_counts()
# First the summary of those group sizes, then how often each group size occurs.
display(
    counted_by_registered.describe(),
    counted_by_registered.value_counts().sort_index(),
)
count 269.000000 mean 3.442379 std 0.554348 min 1.000000 25% 3.000000 50% 3.000000 75% 4.000000 max 6.000000 Name: count, dtype: float64
count 1 1 2 1 3 148 4 117 5 1 6 1 Name: count, dtype: int64
Scheduled time¶
In [846]:
# How many rows share each distinct scheduled timestamp.
counted_by_scheduled = df['scheduledTime'].value_counts()
# First the summary of those group sizes, then how often each group size occurs.
display(
    counted_by_scheduled.describe(),
    counted_by_scheduled.value_counts().sort_index(),
)
count 389.000000 mean 2.380463 std 0.608555 min 1.000000 25% 2.000000 50% 2.000000 75% 3.000000 max 5.000000 Name: count, dtype: float64
count 1 12 2 229 3 138 4 8 5 2 Name: count, dtype: int64
Calculating frequency¶
In [847]:
def calculate_frequency(time_column: str, without_duplicates: bool, data: 'pd.DataFrame' = None) -> float:
    """
    Returns the mean interval between consecutive samples, in seconds.

    Despite the name, the result is a period (seconds per sample); take its
    reciprocal to obtain a frequency in Hz.

    :param time_column: name of the column holding the sample times
    :param without_duplicates: when True, rows repeating a time value are
        collapsed (the last row of each group is kept) before measuring
    :param data: frame to analyse; defaults to the notebook-level ``df``
    :return: (last time - first time) in seconds divided by the number of intervals
    :raises ValueError: if fewer than two timestamps are available
    """
    frame = df if data is None else data
    if without_duplicates:
        timestamps = frame.drop_duplicates(subset=time_column, keep='last')[time_column]
    else:
        timestamps = frame[time_column]

    intervals = len(timestamps) - 1
    if intervals < 1:
        # A single sample would otherwise divide by zero; an empty column would fail on iloc.
        raise ValueError(f"At least two timestamps are required in '{time_column}', got {len(timestamps)}")

    duration = (timestamps.iloc[-1] - timestamps.iloc[0]).total_seconds()
    return duration / intervals
By registered time¶
In [848]:
# calculate_frequency returns seconds per sample (a period), so the value in Hz
# is its reciprocal.
rt_frequency_s = calculate_frequency('registeredTime', without_duplicates=False)
rt_frequency_hz = 1 / rt_frequency_s
print(f"Register frequency: {rt_frequency_s} s or {rt_frequency_hz} Hz")
# Same measurement after collapsing rows that share a registered timestamp.
rt_frequency_without_duplicates_s = calculate_frequency('registeredTime', without_duplicates=True)
rt_frequency_without_duplicates_hz = 1 / rt_frequency_without_duplicates_s
print(f"Without duplicates: {rt_frequency_without_duplicates_s} s or {rt_frequency_without_duplicates_hz} Hz")
# registeredDelta is in milliseconds, hence the division by 1000 below.
# Row 0 is excluded - presumably because the first sample has no predecessor (TODO confirm).
rt_max_period = df.loc[1:, 'registeredDelta'].max()
print(f"Register max period: {rt_max_period / 1000} s")
Register frequency: 0.004526486486486487 s or 220.92190112252206 Hz Without duplicates: 0.01562313432835821 s or 64.0076427036064 Hz Register max period: 0.016 s
By scheduled time¶
In [849]:
# calculate_frequency returns seconds per sample (a period), so the value in Hz
# is its reciprocal.
st_frequency_s = calculate_frequency('scheduledTime', without_duplicates=False)
st_frequency_hz = 1 / st_frequency_s
print(f"Schedule frequency: {st_frequency_s} s or {st_frequency_hz} Hz")
# Same measurement after collapsing rows that share a scheduled timestamp.
st_frequency_without_duplicates_s = calculate_frequency('scheduledTime', without_duplicates=True)
st_frequency_without_duplicates_hz = 1 / st_frequency_without_duplicates_s
print(f"Without duplicates: {st_frequency_without_duplicates_s} s or {st_frequency_without_duplicates_hz} Hz")
# scheduledDelta is in milliseconds, hence the division by 1000 below.
# Row 0 is excluded - presumably because the first sample has no predecessor (TODO confirm).
st_max_period = df.loc[1:, 'scheduledDelta'].max()
print(f"Schedule max period: {st_max_period / 1000} s")
Schedule frequency: 0.004524324324324324 s or 221.02747909199525 Hz Without duplicates: 0.010786082474226803 s or 92.7120669056153 Hz Schedule max period: 0.023 s
Displaying time series¶
In [850]:
import plotly.graph_objects as go
def display_time_series(selected_time_column: str):
    """
    Plots the x and y coordinates of ``df`` as lines against the chosen time column.

    The time values are turned into ``m:s.ms`` text labels with ``format_time``
    before being used as the horizontal axis.

    :param selected_time_column: name of the time column in ``df``
    """
    time_labels = df[selected_time_column].map(format_time)

    fig = go.Figure()
    for coordinate in ('x', 'y'):
        fig.add_trace(go.Scatter(x=time_labels, y=df[coordinate], mode='lines', name=coordinate))

    fig.update_layout(
        title='Coordinates by time',
        xaxis_title=f'{selected_time_column} (m:s.ms)',
        yaxis_title='coordinates',
    )
    fig.update_xaxes(dtick=20, tickangle=30)
    fig.show(renderer='notebook_connected')
def format_time(td: pd.Timedelta) -> str:
    """
    Formats a time delta as ``MM:SS.mmm`` (whole minutes, then seconds with milliseconds).

    The value is rounded to whole milliseconds before it is split into minutes
    and seconds, so a value such as 59.9996 s carries into the minutes field
    ("01:00.000") instead of being rendered as "00:60.000".

    :param td: object exposing ``total_seconds()``
    :return: the formatted label
    """
    total_ms = round(td.total_seconds() * 1000)
    minutes, remainder_ms = divmod(total_ms, 60000)
    return '{:02d}:{:06.3f}'.format(minutes, remainder_ms / 1000)
By registered time¶
In [851]:
# Coordinates plotted against the time reported by the tablet.
display_time_series('registeredTime')
By scheduled time¶
In [852]:
# Coordinates plotted against the scheduled time.
display_time_series('scheduledTime')