Time duplication analysis for the Java-format data¶
File reading¶
In [867]:
import logging
import pandas as pd
# Location of the recording to analyse (relative to the notebook).
files_prefix = '../../data'
filename = '2024-04-14_14-16-31_mouse-java.txt'
filepath = '/'.join((files_prefix, filename))

# Parsing options: text after the comment mark is ignored by the reader and
# the data columns are separated by runs of whitespace.
comment_mark = '#'
separator = '\\s+'

df = pd.read_csv(
    filepath,
    comment=comment_mark,
    sep=separator,
)
Reading start time¶
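The start time is expected in a format without spaces (e.g. ISO 8601, such as 2024-04-14T11:16:27.379Z), because only the last space-separated token of the "Start time" header line is parsed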
In [868]:
import os
from datetime import datetime as dt
# Scan the commented file header: echo it and pick up the recording start time.
start_time = None
time_header = 'Start time'
print('Data header:')
with open(filepath, 'r') as file:
    for row in file:
        if time_header in row:
            # The timestamp is the last whitespace-separated token. split()
            # without arguments also discards the trailing newline and any
            # trailing spaces, which split(' ') would keep.
            start_time = pd.to_datetime(row.split()[-1])
        if row.strip() == '':
            # Reading stops at the first blank line.
            break
        print(row.strip(comment_mark + '\n'))

if start_time is None:
    # Fallback: use the file's ctime, expressed as a naive UTC timestamp.
    # pd.to_datetime(..., unit='s') replaces datetime.utcfromtimestamp, which
    # is deprecated since Python 3.12.
    # NOTE(review): on Unix os.path.getctime is the last metadata change time,
    # not the creation time - confirm this fallback is acceptable.
    creation_time = pd.to_datetime(os.path.getctime(filepath), unit='s')
    start_time = creation_time
    logging.warning("The start time value is not present in the header. The file creation date is selected")
Data header: Tablet rate 250 Screen dimension width=1920 height=1080 Tablet size width=344.16 height=193.59 Event period 0 Start time 2024-04-14T11:16:27.379Z Fields description absoluteTime The absolute time when the event occurred (hardware-depends) registeredTime The time when the event occurred (elapsed from the start time) scheduledTime The time when the event is provided by the library (elapsed from the start time) registered-scheduled deltas The time difference between adjacent events for both timestamps x-y The coordinates at this time tiltX-tiltY The tilts in the respective axes pressure The pen pressure (z-coordinate)
Initial data displaying¶
In [869]:
# Show the start time obtained above and the data frame as loaded from the file.
print(f'The start time is {start_time}')
display(df)
The start time is 2024-04-14 11:16:27.379000
absoluteTime | registeredTime | scheduledTime | registeredDelta | scheduledDelta | x | y | tiltX | tiltY | pressure | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1713093388110 | 0:0:0.731 | 0:0:0.732 | 0:0:0.000 | 0:0:0.000 | 62.37900 | -53.59575 | 0.0 | 0.0 | 0.0 |
1 | 1713093388115 | 0:0:0.736 | 0:0:0.736 | 0:0:0.005 | 0:0:0.004 | 62.37900 | -53.41650 | 0.0 | 0.0 | 0.0 |
2 | 1713093388118 | 0:0:0.739 | 0:0:0.739 | 0:0:0.003 | 0:0:0.003 | 62.37900 | -53.23725 | 0.0 | 0.0 | 0.0 |
3 | 1713093388136 | 0:0:0.757 | 0:0:0.758 | 0:0:0.018 | 0:0:0.019 | 62.37900 | -52.52025 | 0.0 | 0.0 | 0.0 |
4 | 1713093388141 | 0:0:0.762 | 0:0:0.762 | 0:0:0.005 | 0:0:0.004 | 62.37900 | -52.16175 | 0.0 | 0.0 | 0.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1004 | 1713093390730 | 0:0:3.351 | 0:0:3.352 | 0:0:0.002 | 0:0:0.002 | 92.31375 | -54.31275 | 0.0 | 0.0 | 0.0 |
1005 | 1713093390733 | 0:0:3.354 | 0:0:3.354 | 0:0:0.003 | 0:0:0.002 | 92.31375 | -54.13350 | 0.0 | 0.0 | 0.0 |
1006 | 1713093390735 | 0:0:3.356 | 0:0:3.356 | 0:0:0.002 | 0:0:0.002 | 92.13450 | -53.95425 | 0.0 | 0.0 | 0.0 |
1007 | 1713093390738 | 0:0:3.359 | 0:0:3.360 | 0:0:0.003 | 0:0:0.004 | 92.13450 | -53.59575 | 0.0 | 0.0 | 0.0 |
1008 | 1713093390740 | 0:0:3.361 | 0:0:3.363 | 0:0:0.002 | 0:0:0.003 | 91.95525 | -53.41650 | 0.0 | 0.0 | 0.0 |
1009 rows × 10 columns
Converting time columns to the Pandas time¶
In [870]:
# Columns holding the time elapsed from the start, and columns holding the
# difference between adjacent events.
time_from_start_columns = ['registeredTime', 'scheduledTime']
delta_columns = ['registeredDelta', 'scheduledDelta']

for column in time_from_start_columns:
    try:
        # Elapsed time as a pandas timedelta. Adding `start_time` to the
        # result would give an absolute timestamp instead.
        df[column] = pd.to_timedelta(df[column])
    except ValueError:
        # The column could not be parsed (i.e. negative values): it is
        # deliberately left untouched here.
        pass

for column in delta_columns:
    # Only convert columns that are not numeric yet. Without this guard,
    # re-running the cell would parse the millisecond floats as nanoseconds
    # and silently corrupt the deltas.
    if not pd.api.types.is_numeric_dtype(df[column]):
        df[column] = pd.to_timedelta(df[column]).dt.total_seconds() * 1000  # ms
Fixing negative time shift if necessary¶
The time of the registered point is received from the tablet as the number of milliseconds elapsed from an unknown starting point. The scheduled time, on the other hand, is represented as UNIX time.
To calculate this unknown start time, it is assumed that there is the same difference between the first registered point and its start time as there is between the first scheduled point and the system start time (which may not always be true, especially if the library frequency is limited).
However, the delta between adjacent points remains unchanged, so these transformations are mostly cosmetic: they only make the data display correctly.
In [871]:
import datetime
def calculate_register_start_time():
    """
    Estimate the start time of the tablet's own clock.

    The result is the global ``start_time`` plus the first row's registered
    offset minus the first row's scheduled offset. The first ``registeredTime``
    value is parsed with ``convert_to_timedelta``, so it must still be an
    ``H:M:S.ms`` string, while the first ``scheduledTime`` value must already
    be a pandas timedelta.
    """
    base_time = start_time.to_pydatetime()
    registered_offset = convert_to_timedelta(df.loc[0, 'registeredTime'])
    scheduled_offset = df.loc[0, 'scheduledTime'].to_pytimedelta()
    return base_time + registered_offset - scheduled_offset
def recalculate_registered_time(registered_time: str, tablet_start_time: pd.Timestamp) -> pd.Timedelta:
    """
    Re-express a registered time relative to the tablet start time.

    :param registered_time: elapsed time as an ``H:M:S.ms`` string
    :param tablet_start_time: start of the tablet clock as a timestamp (the
        value is subtracted from a timestamp, so it cannot be a timedelta as
        the previous annotation suggested)
    :return: ``(start_time + registered_time) - tablet_start_time``, where
        ``start_time`` is the notebook-level recording start time
    """
    delta = pd.to_timedelta(convert_to_timedelta(registered_time))
    return (start_time + delta) - tablet_start_time
def convert_to_timedelta(delta: str) -> datetime.timedelta:
    """
    Build a ``datetime.timedelta`` from a string in the H:M:S.ms format.

    Every component is read as an integer; the part after the dot is taken
    as a number of milliseconds. A string without exactly two colons and one
    dot in the last component raises ``ValueError``.
    """
    hours_part, minutes_part, seconds_and_ms = delta.split(':')
    seconds_part, ms_part = seconds_and_ms.split('.')
    return datetime.timedelta(
        hours=int(hours_part),
        minutes=int(minutes_part),
        seconds=int(seconds_part),
        milliseconds=int(ms_part),
    )
try:
    # Used purely as a probe: the parsed result is discarded, only a failure
    # to parse matters.
    pd.to_timedelta(df['registeredTime'])
except ValueError:
    # The column cannot be parsed as-is, so every value is re-expressed
    # relative to the estimated tablet start time.
    tablet_start_time = pd.to_datetime(calculate_register_start_time())
    df['registeredTime'] = df['registeredTime'].map(
        lambda raw_value: recalculate_registered_time(raw_value, tablet_start_time)
    )

display(df)
absoluteTime | registeredTime | scheduledTime | registeredDelta | scheduledDelta | x | y | tiltX | tiltY | pressure | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1713093388110 | 0 days 00:00:00.731000 | 0 days 00:00:00.732000 | 0.0 | 0.0 | 62.37900 | -53.59575 | 0.0 | 0.0 | 0.0 |
1 | 1713093388115 | 0 days 00:00:00.736000 | 0 days 00:00:00.736000 | 5.0 | 4.0 | 62.37900 | -53.41650 | 0.0 | 0.0 | 0.0 |
2 | 1713093388118 | 0 days 00:00:00.739000 | 0 days 00:00:00.739000 | 3.0 | 3.0 | 62.37900 | -53.23725 | 0.0 | 0.0 | 0.0 |
3 | 1713093388136 | 0 days 00:00:00.757000 | 0 days 00:00:00.758000 | 18.0 | 19.0 | 62.37900 | -52.52025 | 0.0 | 0.0 | 0.0 |
4 | 1713093388141 | 0 days 00:00:00.762000 | 0 days 00:00:00.762000 | 5.0 | 4.0 | 62.37900 | -52.16175 | 0.0 | 0.0 | 0.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1004 | 1713093390730 | 0 days 00:00:03.351000 | 0 days 00:00:03.352000 | 2.0 | 2.0 | 92.31375 | -54.31275 | 0.0 | 0.0 | 0.0 |
1005 | 1713093390733 | 0 days 00:00:03.354000 | 0 days 00:00:03.354000 | 3.0 | 2.0 | 92.31375 | -54.13350 | 0.0 | 0.0 | 0.0 |
1006 | 1713093390735 | 0 days 00:00:03.356000 | 0 days 00:00:03.356000 | 2.0 | 2.0 | 92.13450 | -53.95425 | 0.0 | 0.0 | 0.0 |
1007 | 1713093390738 | 0 days 00:00:03.359000 | 0 days 00:00:03.360000 | 3.0 | 4.0 | 92.13450 | -53.59575 | 0.0 | 0.0 | 0.0 |
1008 | 1713093390740 | 0 days 00:00:03.361000 | 0 days 00:00:03.363000 | 2.0 | 3.0 | 91.95525 | -53.41650 | 0.0 | 0.0 | 0.0 |
1009 rows × 10 columns
Data statistics¶
In [872]:
# Summary statistics for every column of the converted frame.
display(df.describe())
absoluteTime | registeredTime | scheduledTime | registeredDelta | scheduledDelta | x | y | tiltX | tiltY | pressure | |
---|---|---|---|---|---|---|---|---|---|---|
count | 1.009000e+03 | 1009 | 1009 | 1009.000000 | 1009.000000 | 1009.000000 | 1009.000000 | 1009.0 | 1009.0 | 1009.0 |
mean | 1.713093e+12 | 0 days 00:00:02.140176412 | 0 days 00:00:02.140847373 | 2.606541 | 2.607532 | 62.041463 | -55.624526 | 0.0 | 0.0 | 0.0 |
std | 7.135426e+02 | 0 days 00:00:00.713542608 | 0 days 00:00:00.713649934 | 1.642321 | 1.819914 | 16.712767 | 14.748513 | 0.0 | 0.0 | 0.0 |
min | 1.713093e+12 | 0 days 00:00:00.731000 | 0 days 00:00:00.732000 | 0.000000 | 0.000000 | 31.906500 | -85.323000 | 0.0 | 0.0 | 0.0 |
25% | 1.713093e+12 | 0 days 00:00:01.553000 | 0 days 00:00:01.553000 | 2.000000 | 2.000000 | 49.293750 | -66.501750 | 0.0 | 0.0 | 0.0 |
50% | 1.713093e+12 | 0 days 00:00:02.140000 | 0 days 00:00:02.141000 | 2.000000 | 2.000000 | 61.841250 | -56.105250 | 0.0 | 0.0 | 0.0 |
75% | 1.713093e+12 | 0 days 00:00:02.764000 | 0 days 00:00:02.764000 | 3.000000 | 3.000000 | 74.926500 | -44.274750 | 0.0 | 0.0 | 0.0 |
max | 1.713093e+12 | 0 days 00:00:03.361000 | 0 days 00:00:03.363000 | 18.000000 | 19.000000 | 92.493000 | -26.887500 | 0.0 | 0.0 | 0.0 |
In [873]:
# Number of rows sharing each distinct registeredTime value.
counted_by_registered = df['registeredTime'].value_counts()
display(counted_by_registered.describe())
# How many distinct timestamps occur once, twice, ... (ordered by multiplicity).
display(counted_by_registered.value_counts().sort_index())
count 995.00000 mean 1.01407 std 0.11784 min 1.00000 25% 1.00000 50% 1.00000 75% 1.00000 max 2.00000 Name: count, dtype: float64
count 1 981 2 14 Name: count, dtype: int64
Scheduled time¶
In [874]:
# Number of rows sharing each distinct scheduledTime value.
counted_by_scheduled = df['scheduledTime'].value_counts()
display(counted_by_scheduled.describe())
# How many distinct timestamps occur once, twice, ... (ordered by multiplicity).
display(counted_by_scheduled.value_counts().sort_index())
count 882.000000 mean 1.143991 std 0.351280 min 1.000000 25% 1.000000 50% 1.000000 75% 1.000000 max 2.000000 Name: count, dtype: float64
count 1 755 2 127 Name: count, dtype: int64
Calculating frequency¶
In [875]:
def calculate_frequency(time_column: str, without_duplicates: bool, data: "pd.DataFrame | None" = None) -> float:
    """
    Return the mean sampling period of a time column, in seconds.

    Despite its name, the function returns the average time between adjacent
    samples (a period); take the reciprocal of the result to obtain Hz.

    :param time_column: name of a column holding time values whose
        differences expose ``total_seconds()``
    :param without_duplicates: when True, rows repeating an already seen time
        value are dropped (the last occurrence is kept) before averaging
    :param data: frame to analyse; defaults to the notebook-level ``df``
    :return: ``(last - first) / (number of samples - 1)`` in seconds
    :raises ValueError: if fewer than two samples remain, because a period
        cannot be computed from a single point
    """
    frame = df if data is None else data
    timestamps = frame[time_column]
    if without_duplicates:
        timestamps = frame.drop_duplicates(subset=time_column, keep='last')[time_column]
    if len(timestamps) < 2:
        raise ValueError(
            f"At least two samples are required to calculate a period for '{time_column}', "
            f"got {len(timestamps)}"
        )
    first_time = timestamps.iloc[0]
    last_time = timestamps.iloc[-1]
    duration = (last_time - first_time).total_seconds()
    return duration / (len(timestamps) - 1)
By registered time¶
In [876]:
# calculate_frequency returns the mean time between samples in seconds,
# so its reciprocal is the rate in Hz.
rt_frequency_s = calculate_frequency('registeredTime', without_duplicates=False)
rt_frequency_hz = 1 / rt_frequency_s
print(f"Register frequency: {rt_frequency_s} s or {rt_frequency_hz} Hz")
# Same calculation after dropping rows that repeat a registered timestamp.
rt_frequency_without_duplicates_s = calculate_frequency('registeredTime', without_duplicates=True)
rt_frequency_without_duplicates_hz = 1 / rt_frequency_without_duplicates_s
print(f"Without duplicates: {rt_frequency_without_duplicates_s} s or {rt_frequency_without_duplicates_hz} Hz")
# Row 0 is left out of the maximum; the deltas are in milliseconds, hence / 1000.
rt_max_period = df.loc[1:, 'registeredDelta'].max()
print(f"Register max period: {rt_max_period / 1000} s")
Register frequency: 0.002609126984126984 s or 383.2699619771863 Hz Without duplicates: 0.0026458752515090543 s or 377.9467680608365 Hz Register max period: 0.018 s
By scheduled time¶
In [877]:
# calculate_frequency returns the mean time between samples in seconds,
# so its reciprocal is the rate in Hz.
st_frequency_s = calculate_frequency('scheduledTime', without_duplicates=False)
st_frequency_hz = 1 / st_frequency_s
print(f"Schedule frequency: {st_frequency_s} s or {st_frequency_hz} Hz")
# Same calculation after dropping rows that repeat a scheduled timestamp.
st_frequency_without_duplicates_s = calculate_frequency('scheduledTime', without_duplicates=True)
st_frequency_without_duplicates_hz = 1 / st_frequency_without_duplicates_s
print(f"Without duplicates: {st_frequency_without_duplicates_s} s or {st_frequency_without_duplicates_hz} Hz")
# Row 0 is left out of the maximum; the deltas are in milliseconds, hence / 1000.
st_max_period = df.loc[1:, 'scheduledDelta'].max()
print(f"Schedule max period: {st_max_period / 1000} s")
Schedule frequency: 0.0026101190476190477 s or 383.1242873432155 Hz Without duplicates: 0.002986379114642452 s or 334.8536678069175 Hz Schedule max period: 0.019 s
Displaying the time series¶
In [878]:
import plotly.graph_objects as go
def display_time_series(selected_time_column: str):
    """
    Plot the x and y coordinates of ``df`` against one of its time columns.

    The time values are turned into labels with ``format_time`` and both
    coordinates are drawn as line traces on a single figure.

    :param selected_time_column: name of the ``df`` column used for the x axis
    """
    time_labels = df[selected_time_column].map(format_time)
    fig = go.Figure()
    for trace_name, coordinates in (('x', df.x), ('y', df.y)):
        fig.add_trace(go.Scatter(x=time_labels, y=coordinates, mode='lines', name=trace_name))
    fig.update_layout(
        title='Coordinates by time',
        xaxis_title=f'{selected_time_column} (m:s.ms)',
        yaxis_title='coordinates',
    )
    fig.update_xaxes(dtick=20, tickangle=30)
    fig.show(renderer='notebook_connected')
def format_time(td: pd.Timedelta):
    """
    Render a time delta as ``MM:SS.mmm`` (whole minutes, then seconds with
    three decimals).
    """
    whole_minutes, remaining_seconds = divmod(td.total_seconds(), 60)
    return '{:02d}:{:06.3f}'.format(int(whole_minutes), remaining_seconds)
By registered time¶
In [879]:
# Coordinates plotted against the time registered for each event.
display_time_series('registeredTime')
By scheduled time¶
In [880]:
# Coordinates plotted against the scheduled time of each event.
display_time_series('scheduledTime')