import logging

import pandas as pd

# Location of the recording, relative to the working directory.
files_prefix = '../../data'
filename = '2024-04-14_14-16-31_mouse-java.txt'
filepath = f'{files_prefix}/{filename}'

# Everything after this mark on a line is ignored by read_csv; the same mark
# is stripped when the file header is echoed.
comment_mark = '#'
# Regular expression: columns are separated by runs of whitespace.
separator = r'\s+'

df = pd.read_csv(filepath, sep=separator, comment=comment_mark)

import os
from datetime import datetime as dt

from datetime import timezone

start_time = None
time_header = 'Start time'

# Echo the leading part of the file (up to the first empty line) and pick up
# the start time from any line that contains the `time_header` label.
print('Data header:')
with open(filepath, 'r') as file:
    for row in file:
        if time_header in row:
            # The timestamp is the last space-separated token of the line.
            start_time = pd.to_datetime(row.split(' ')[-1])

        if row.strip() == '':
            break
        print(row.strip(comment_mark + '\n'))

if start_time is None:
    # Fallback: a naive UTC datetime built from os.path.getctime.
    # NOTE(review): on Unix getctime is the last metadata change, not the
    # creation time - confirm this is acceptable for the target platform.
    creation_time = dt.fromtimestamp(os.path.getctime(filepath), tz=timezone.utc).replace(tzinfo=None)
    start_time = pd.to_datetime(creation_time)

    logging.warning("The start time value is not present in the header. The file creation date is selected")

Data header:
 Tablet rate 250
 Screen dimension width=1920 height=1080
 Tablet size width=344.16 height=193.59
 Event period 0
 Start time 2024-04-14T11:16:27.379Z

 Fields description
 absoluteTime The absolute time when the event occurred (hardware-depends)
 registeredTime The time when the event occurred (elapsed from the start time)
 scheduledTime The time when the event is provided by the library (elapsed from the start time)
 registered-scheduled deltas The time difference between adjacent events for both timestamps
 x-y The coordinates at this time
 tiltX-tiltY The tilts in the respective axes
 pressure The pen pressure (z-coordinate)

# Report the start time chosen earlier and show the loaded table.
print(f'The start time is {start_time}')

display(df)

The start time is 2024-04-14 11:16:27.379000

time_from_start_columns = ['registeredTime', 'scheduledTime']
delta_columns = ['registeredDelta', 'scheduledDelta']

# Elapsed-time columns become Timedelta values. Adding `start_time` to a
# converted column would turn it into absolute timestamps instead.
for column in time_from_start_columns:
    try:
        converted = pd.to_timedelta(df[column])
    except ValueError:  # i.e. negative values
        # Leave the column untouched so that it can be repaired later on.
        continue

    df[column] = converted

# Deltas between adjacent events are stored as plain numbers of milliseconds.
for column in delta_columns:
    df[column] = pd.to_timedelta(df[column]).dt.total_seconds() * 1000  # ms

import datetime


def calculate_register_start_time():
    """
    Computes the reference moment for the registered time of the first row

    The registered offset of row 0 (still a raw string, parsed with
    convert_to_timedelta) is added to the global start time and the scheduled
    offset of row 0 is subtracted from the sum.
    """

    origin = start_time.to_pydatetime()
    registered_offset = convert_to_timedelta(df.loc[0, 'registeredTime'])
    scheduled_offset = df.loc[0, 'scheduledTime'].to_pytimedelta()

    return origin + registered_offset - scheduled_offset


def recalculate_registered_time(registered_time: str, tablet_start_time: pd.Timestamp) -> pd.Timedelta:
    """
    Re-expresses a raw registered time relative to the tablet start time

    :param registered_time: the raw offset string in the H:M:S.ms format
    :param tablet_start_time: the moment the result is counted from (a timestamp, not a delta)
    :return: (global start time + parsed offset) - tablet_start_time
    """

    delta = pd.to_timedelta(convert_to_timedelta(registered_time))

    return (start_time + delta) - tablet_start_time


def convert_to_timedelta(delta: str) -> datetime.timedelta:
    """
    Builds a timedelta from a string in the H:M:S.ms format

    Each of the four components is read as an independent integer, so the part
    after the dot is a number of milliseconds (not a decimal fraction) and any
    component may carry its own sign.
    """

    hours, minutes, seconds_and_ms = delta.split(':')
    seconds, milliseconds = seconds_and_ms.split('.')

    return datetime.timedelta(
        hours=int(hours),
        minutes=int(minutes),
        seconds=int(seconds),
        milliseconds=int(milliseconds),
    )


# Probe whether registeredTime is convertible; the result itself is discarded.
# Only when the probe fails are the raw strings re-based on the tablet start time.
try:
    pd.to_timedelta(df['registeredTime'])
except ValueError:
    tablet_start_time = pd.to_datetime(calculate_register_start_time())
    recalculated = df['registeredTime'].map(
        lambda raw: recalculate_registered_time(raw, tablet_start_time)
    )
    df['registeredTime'] = recalculated

# The table again, followed by its per-column summary statistics.
display(df)

display(df.describe())

# Number of rows that share each distinct registeredTime value.
counted_by_registered = df['registeredTime'].value_counts()
display(counted_by_registered.describe())

# Distribution of those counts: how many timestamps occur once, twice, ...
display(counted_by_registered.value_counts().sort_index())

count    995.00000
mean       1.01407
std        0.11784
min        1.00000
25%        1.00000
50%        1.00000
75%        1.00000
max        2.00000
Name: count, dtype: float64

count
1    981
2     14
Name: count, dtype: int64

# Number of rows that share each distinct scheduledTime value.
counted_by_scheduled = df['scheduledTime'].value_counts()
display(counted_by_scheduled.describe())

# Distribution of those counts: how many timestamps occur once, twice, ...
display(counted_by_scheduled.value_counts().sort_index())

count    882.000000
mean       1.143991
std        0.351280
min        1.000000
25%        1.000000
50%        1.000000
75%        1.000000
max        2.000000
Name: count, dtype: float64

count
1    755
2    127
Name: count, dtype: int64

def calculate_frequency(time_column: str, without_duplicates: bool) -> float:
    """
    Returns the mean interval between samples of a time column, in seconds

    Despite the name, the result is a period: the span between the first and
    the last value divided by the number of gaps. Callers invert it to get Hz.

    :param time_column: the name of a Timedelta column of the global DataFrame
    :param without_duplicates: count repeated time values only once (the last occurrence is kept)
    """
    timestamps = df[time_column]
    if without_duplicates:
        timestamps = timestamps.drop_duplicates(keep='last')

    elapsed = timestamps.iloc[-1] - timestamps.iloc[0]
    gaps = len(timestamps) - 1

    return elapsed.total_seconds() / gaps

# NOTE: calculate_frequency returns seconds per sample, so the `*_s` values are
# mean periods and their reciprocals are the rates in Hz.
rt_frequency_s = calculate_frequency('registeredTime', without_duplicates=False)
rt_frequency_hz = 1 / rt_frequency_s
print(f"Register frequency: {rt_frequency_s} s or {rt_frequency_hz} Hz")

# The same estimate with repeated registeredTime values counted once.
rt_frequency_without_duplicates_s = calculate_frequency('registeredTime', without_duplicates=True)
rt_frequency_without_duplicates_hz = 1 / rt_frequency_without_duplicates_s
print(f"Without duplicates: {rt_frequency_without_duplicates_s} s or {rt_frequency_without_duplicates_hz} Hz")

# Largest registered delta, taken from label 1 onwards (row 0 is left out);
# the column holds milliseconds, hence the division by 1000 when printing.
rt_max_period = df.loc[1:, 'registeredDelta'].max()
print(f"Register max period: {rt_max_period / 1000} s")

Register frequency: 0.002609126984126984 s or 383.2699619771863 Hz
Without duplicates: 0.0026458752515090543 s or 377.9467680608365 Hz
Register max period: 0.018 s

# NOTE: calculate_frequency returns seconds per sample, so the `*_s` values are
# mean periods and their reciprocals are the rates in Hz.
st_frequency_s = calculate_frequency('scheduledTime', without_duplicates=False)
st_frequency_hz = 1 / st_frequency_s
print(f"Schedule frequency: {st_frequency_s} s or {st_frequency_hz} Hz")

# The same estimate with repeated scheduledTime values counted once.
st_frequency_without_duplicates_s = calculate_frequency('scheduledTime', without_duplicates=True)
st_frequency_without_duplicates_hz = 1 / st_frequency_without_duplicates_s
print(f"Without duplicates: {st_frequency_without_duplicates_s} s or {st_frequency_without_duplicates_hz} Hz")

# Largest scheduled delta, taken from label 1 onwards (row 0 is left out);
# the column holds milliseconds, hence the division by 1000 when printing.
st_max_period = df.loc[1:, 'scheduledDelta'].max()
print(f"Schedule max period: {st_max_period / 1000} s")

Schedule frequency: 0.0026101190476190477 s or 383.1242873432155 Hz
Without duplicates: 0.002986379114642452 s or 334.8536678069175 Hz
Schedule max period: 0.019 s

import plotly.graph_objects as go


def display_time_series(selected_time_column: str):
    """
    Plots the x and y columns of the global DataFrame against a time column

    The time values are turned into text labels with format_time before
    plotting, and the figure is shown with the 'notebook_connected' renderer.

    :param selected_time_column: the name of the Timedelta column used for the horizontal axis
    """
    time_labels = df[selected_time_column].map(format_time)

    fig = go.Figure()
    for label, values in (('x', df.x), ('y', df.y)):
        trace = go.Scatter(x=time_labels, y=values, mode='lines', name=label)
        fig.add_trace(trace)

    layout = {
        'title': 'Coordinates by time',
        'xaxis_title': f'{selected_time_column} (m:s.ms)',
        'yaxis_title': 'coordinates',
    }
    fig.update_layout(**layout)

    # NOTE(review): the x values are strings, so dtick=20 presumably keeps
    # one tick per 20 samples - confirm against the rendered figure.
    fig.update_xaxes(dtick=20, tickangle=30)

    fig.show(renderer='notebook_connected')


def format_time(td: pd.Timedelta):
    """
    Formats a time delta as MM:SS.mmm (minutes are not wrapped into hours)

    The total is rounded to milliseconds before it is split into minutes and
    seconds; otherwise a value such as 59.9996 s would be rendered as
    '00:60.000' because only the seconds part would be rounded up.
    """
    total_seconds = round(td.total_seconds(), 3)
    minutes, seconds = divmod(total_seconds, 60)

    return '{:02d}:{:06.3f}'.format(int(minutes), seconds)

# Draw the same x/y plot against each of the two time columns.
display_time_series('registeredTime')

display_time_series('scheduledTime')

	absoluteTime	registeredTime	scheduledTime	registeredDelta	scheduledDelta	x	y	tiltX	tiltY	pressure
0	1713093388110	0:0:0.731	0:0:0.732	0:0:0.000	0:0:0.000	62.37900	-53.59575	0.0	0.0	0.0
1	1713093388115	0:0:0.736	0:0:0.736	0:0:0.005	0:0:0.004	62.37900	-53.41650	0.0	0.0	0.0
2	1713093388118	0:0:0.739	0:0:0.739	0:0:0.003	0:0:0.003	62.37900	-53.23725	0.0	0.0	0.0
3	1713093388136	0:0:0.757	0:0:0.758	0:0:0.018	0:0:0.019	62.37900	-52.52025	0.0	0.0	0.0
4	1713093388141	0:0:0.762	0:0:0.762	0:0:0.005	0:0:0.004	62.37900	-52.16175	0.0	0.0	0.0
...	...	...	...	...	...	...	...	...	...	...
1004	1713093390730	0:0:3.351	0:0:3.352	0:0:0.002	0:0:0.002	92.31375	-54.31275	0.0	0.0	0.0
1005	1713093390733	0:0:3.354	0:0:3.354	0:0:0.003	0:0:0.002	92.31375	-54.13350	0.0	0.0	0.0
1006	1713093390735	0:0:3.356	0:0:3.356	0:0:0.002	0:0:0.002	92.13450	-53.95425	0.0	0.0	0.0
1007	1713093390738	0:0:3.359	0:0:3.360	0:0:0.003	0:0:0.004	92.13450	-53.59575	0.0	0.0	0.0
1008	1713093390740	0:0:3.361	0:0:3.363	0:0:0.002	0:0:0.003	91.95525	-53.41650	0.0	0.0	0.0

	absoluteTime	registeredTime	scheduledTime	registeredDelta	scheduledDelta	x	y	tiltX	tiltY	pressure
0	1713093388110	0 days 00:00:00.731000	0 days 00:00:00.732000	0.0	0.0	62.37900	-53.59575	0.0	0.0	0.0
1	1713093388115	0 days 00:00:00.736000	0 days 00:00:00.736000	5.0	4.0	62.37900	-53.41650	0.0	0.0	0.0
2	1713093388118	0 days 00:00:00.739000	0 days 00:00:00.739000	3.0	3.0	62.37900	-53.23725	0.0	0.0	0.0
3	1713093388136	0 days 00:00:00.757000	0 days 00:00:00.758000	18.0	19.0	62.37900	-52.52025	0.0	0.0	0.0
4	1713093388141	0 days 00:00:00.762000	0 days 00:00:00.762000	5.0	4.0	62.37900	-52.16175	0.0	0.0	0.0
...	...	...	...	...	...	...	...	...	...	...
1004	1713093390730	0 days 00:00:03.351000	0 days 00:00:03.352000	2.0	2.0	92.31375	-54.31275	0.0	0.0	0.0
1005	1713093390733	0 days 00:00:03.354000	0 days 00:00:03.354000	3.0	2.0	92.31375	-54.13350	0.0	0.0	0.0
1006	1713093390735	0 days 00:00:03.356000	0 days 00:00:03.356000	2.0	2.0	92.13450	-53.95425	0.0	0.0	0.0
1007	1713093390738	0 days 00:00:03.359000	0 days 00:00:03.360000	3.0	4.0	92.13450	-53.59575	0.0	0.0	0.0
1008	1713093390740	0 days 00:00:03.361000	0 days 00:00:03.363000	2.0	3.0	91.95525	-53.41650	0.0	0.0	0.0

	absoluteTime	registeredTime	scheduledTime	registeredDelta	scheduledDelta	x	y	tiltX	tiltY	pressure
count	1.009000e+03	1009	1009	1009.000000	1009.000000	1009.000000	1009.000000	1009.0	1009.0	1009.0
mean	1.713093e+12	0 days 00:00:02.140176412	0 days 00:00:02.140847373	2.606541	2.607532	62.041463	-55.624526	0.0	0.0	0.0
std	7.135426e+02	0 days 00:00:00.713542608	0 days 00:00:00.713649934	1.642321	1.819914	16.712767	14.748513	0.0	0.0	0.0
min	1.713093e+12	0 days 00:00:00.731000	0 days 00:00:00.732000	0.000000	0.000000	31.906500	-85.323000	0.0	0.0	0.0
25%	1.713093e+12	0 days 00:00:01.553000	0 days 00:00:01.553000	2.000000	2.000000	49.293750	-66.501750	0.0	0.0	0.0
50%	1.713093e+12	0 days 00:00:02.140000	0 days 00:00:02.141000	2.000000	2.000000	61.841250	-56.105250	0.0	0.0	0.0
75%	1.713093e+12	0 days 00:00:02.764000	0 days 00:00:02.764000	3.000000	3.000000	74.926500	-44.274750	0.0	0.0	0.0
max	1.713093e+12	0 days 00:00:03.361000	0 days 00:00:03.363000	18.000000	19.000000	92.493000	-26.887500	0.0	0.0	0.0

Time duplication analysis for the Java-format data

File reading

Reading start time

Initial data display

Converting time columns to Pandas timedeltas

Fixing negative time shift if necessary

Data statistics

Grouping by time

Registered time

Scheduled time

Calculating frequency

By registered time

By scheduled time

Displaying time series

By registered time

By scheduled time