Source code for pylife.stress.timesignal

# Copyright (c) 2019-2023 - for information on the respective copyright owner
# see the NOTICE file and/or the repository
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""A module for time signal handling

This module is not considered finalized even though it is part of pylife-2.0.
Breaking changes might occur in upcoming minor releases.
"""

__author__ = "Johannes Mueller"
__maintainer__ = __author__

import numpy as np
import pandas as pd
import scipy.stats as stats
import scipy.signal as signal

from matplotlib.mlab import psd

    import tsfresh as ts
    _HAVE_TSFRESH = True
except ModuleNotFoundError:
    _HAVE_TSFRESH = False

class TimeSignalGenerator:
    r"""Generates mixed time signals

    The generated time signal is a mixture of a random set of sine signals.

    The user supplies a dict describing the set::

        sine_set = {
            'number': number of signals
            'amplitude_median':
            'amplitude_std_dev':
            'frequency_median':
            'frequency_std_dev':
            'offset_median':
            'offset_std_dev':
        }

    The amplitudes (:math:`A`), frequencies (:math:`\omega`) and
    offsets (:math:`c`) are then norm distributed. Each sine signal
    looks like

            :math:`s = A \sin(\omega t + \phi) + c`

    where :math:`\phi` is a random value between 0 and :math:`2\pi`.

    So the whole sine set :math:`S` is given by the following expression:

            :math:`S = \sum^n_i A_i \sin(\omega_i t + \phi_i) + c_i`.

    Parameters
    ----------
    sample_rate : float
        number of samples per unit of time delivered by :meth:`query`
    sine_set : dict
        description of the sine set, see above
    gauss_set
        accepted for interface compatibility, not used by this implementation
    log_gauss_set
        accepted for interface compatibility, not used by this implementation
    """

    def __init__(self, sample_rate, sine_set, gauss_set, log_gauss_set):
        count = sine_set['number']

        # The draw order (amplitudes, frequencies, offsets, phases) is kept
        # stable so that seeded runs stay reproducible.
        sine_amplitudes = stats.norm.rvs(loc=sine_set['amplitude_median'],
                                         scale=sine_set['amplitude_std_dev'],
                                         size=count)
        sine_frequencies = stats.norm.rvs(loc=sine_set['frequency_median'],
                                          scale=sine_set['frequency_std_dev'],
                                          size=count)
        sine_offsets = stats.norm.rvs(loc=sine_set['offset_median'],
                                      scale=sine_set['offset_std_dev'],
                                      size=count)
        sine_phases = 2. * np.pi * np.random.rand(count)

        # One (amplitude, omega, phase, offset) tuple per sine signal.
        self.sine_set = list(
            zip(sine_amplitudes, sine_frequencies, sine_phases, sine_offsets))

        self.sample_rate = sample_rate
        self.time_position = 0.0

    def query(self, sample_num):
        """Gets a sample chunk of the time signal

        Parameters
        ----------
        sample_num : int
            number of the samples requested

        Returns
        -------
        samples : 1D numpy.ndarray
            the requested samples

        You can query multiple times, the newly delivered samples
        will smoothly attach to the previously queried ones.
        """
        end_time_position = self.time_position + \
            (sample_num - 1) / self.sample_rate

        # Every sine is evaluated on the same absolute time axis.  Shifting
        # the axis by anything other than whole periods (2*pi/omega) would
        # change the phase and break the continuity between two chunks.
        time = np.linspace(self.time_position, end_time_position, sample_num)

        samples = np.zeros(sample_num)
        for ampl, omega, phi, offset in self.sine_set:
            samples += ampl * np.sin(omega * time + phi) + offset

        # The next chunk starts one sample period after the last sample.
        self.time_position = end_time_position + 1. / self.sample_rate

        return samples

    def reset(self):
        """Resets the generator

        A reset generator delivers its samples starting from time zero again.
        The randomly drawn sine set is kept.
        """
        self.time_position = 0.0
def fs_calc(df):
    """Calculates the sample frequency of a DataFrame time series

    The sample frequency is the reciprocal of the mean distance between
    consecutive index values.

    Parameters
    ----------
    df : DataFrame
        time series, the index holds the time values

    Returns
    -------
    fs : int, float
        sample frequency; ``1`` if the computation on the index raises a
        ``TypeError`` (e.g. for a string index)
    """
    try:
        mean_step = np.diff(df.index).mean()
        return 1 / mean_step
    except TypeError:
        # No usable time information in the index: fall back to fs = 1.
        print("Index has to be a number not a string. We assume fs = 1")
        return 1
def resample_acc(df, fs=1):
    """Resamples a pandas time series DataFrame

    Every column is linearly interpolated (``numpy.interp``) onto an
    equidistant index that starts at the smallest index value of ``df`` and
    advances in steps of ``1/fs``.

    Parameters
    ----------
    df : DataFrame
        time series, the index holds the time values
    fs : float
        sample rate of the resampled time series

    Returns
    -------
    DataFrame
        the resampled time series with the same columns as ``df``
    """
    step = 1 / fs
    old_index = df.index
    new_index = np.arange(old_index.min(), old_index.max() + step, step)

    def _interpolate(column):
        return np.interp(new_index, old_index, column)

    resampled = df.apply(_interpolate)
    return pd.DataFrame(resampled.values, index=new_index, columns=df.columns)
def butter_bandpass(df, lowcut, highcut, order=5):
    """Applies a Butterworth band-pass filter to every column of a DataFrame

    The filter is designed with ``scipy.signal.butter`` and applied forward
    and backward with ``scipy.signal.filtfilt``. The sample frequency is
    derived from the index of ``df`` using :func:`fs_calc`.

    Parameters
    ----------
    df : DataFrame
        time series, the index holds the time values
    lowcut : float
        low cut-off frequency
    highcut : float
        high cut-off frequency
    order : int, optional
        Butterworth filter order. The default is 5.

    Returns
    -------
    TSout : DataFrame
        the filtered time series
    """
    fs = fs_calc(df)
    nyquist = 0.5 * fs

    # scipy expects the band edges normalized to the Nyquist frequency.
    band = [lowcut / nyquist, highcut / nyquist]
    b, a = signal.butter(order, band, btype='bandpass')

    # Pad each end with half a second worth of samples.
    pad = int(fs / 2)

    def _filter_column(column):
        return signal.filtfilt(b, a, column, padlen=pad)

    return df.apply(_filter_column)
def psd_df(df_ts, NFFT=512):
    """Calculates the power spectral density of every column of a DataFrame

    The PSD is computed with ``matplotlib.mlab.psd`` (Welch algorithm). The
    sample frequency is derived from the index of ``df_ts`` using
    :func:`fs_calc`.

    Parameters
    ----------
    df_ts : DataFrame
        time series dataframe
    NFFT : int, optional
        buffer size. The default is 512.

    Returns
    -------
    df_psd : DataFrame
        PSD of each column, indexed by ``frequency``
    """
    fs = fs_calc(df_ts)

    df_psd = pd.DataFrame()
    for col in df_ts:
        power, freq = psd(df_ts[col], Fs=fs, NFFT=NFFT)
        df_psd[col] = power

    # The frequency axis of the last processed column becomes the index.
    df_psd.index = pd.Index(freq, name="frequency")
    return df_psd
def _prepare_rolling(df): """ Adds ID, time to the dataset for TsFresh, We would need different ID's if we had independant timeseries -like timeseries for different robots. Parameters ---------- df: pandas DataFrame input data self : TimeSignalPrep class Returns ------- df : pandas DataFrame output DataFrame with added id, time """ prep_roll = df.copy() prep_roll["id"] = 0 prep_roll["time"] = df.index.values prep_roll["time"] = prep_roll["time"].subtract(prep_roll["time"].values[0]) prep_roll.index = prep_roll["time"] return prep_roll def _roll_dataset(prep_roll_df, window_size=1000, overlap=200): """ Rolls dataset in windows so we can later extract features from every window Parameters ---------- prep_roll: output from prepare_rolling window_size : int , optional window size of the rolled segments -the default is 1000. overlap : int, optional overlap between 2 adjecent windows -The default is 200. Returns ------- df_rolled : pandas DataFrame rolled DataFrame """ # Create Rolled Dataset with Parameter rolling_direction & window_size # throws away the last halfshift rolling_direction = window_size - overlap cycles = int((len(prep_roll_df)-window_size) / rolling_direction)+1 parts = [] # shiften for i in range(cycles): position = (rolling_direction) * i shift = prep_roll_df.iloc[position: position + window_size, :].copy() # change IDs to format (id,time) df = pd.DataFrame({'id': np.int64(np.zeros(len(shift), dtype=int)), 'max_time': shift.iloc[-1, -1]}) shift['id'] = pd.MultiIndex.from_frame(df).to_numpy() parts.append(shift) return pd.concat(parts, ignore_index=True) def _extract_feature_df(df_rolled, feature="maximum"): """Extracts features like "abs_energy" or "maximum" from the rolled dataset with TsFresh Parameters ---------- df_rolled : pandas DataFrame rolled DataFrame from roll_dataset feature : string, optional Extracted feature - only supports one at a time - and only features form tsfresh that dont need extra parameters. The default is "maximum". 
Returns ------- extracted_features : pandas DataFrame Dataframe of extracted features """ # extract features # fc_parameters = {"abs_energy", "maximum"} fc_parameters = { feature: None, } extracted_features = ts.extract_features( df_rolled, column_id="id", column_sort="time", default_fc_parameters=fc_parameters, n_jobs=0, ) extracted_features.index = range(len(extracted_features)) return extracted_features def _select_relevant_windows(prep_roll, extracted_features, comparison_column_ex, fraction_max=0.25, window_size=1000, overlap=200, n_gridpoints=3, method="keep"): """ Writes n_gridpoints NaN's into the window_sizes with extracted features lower than fraction_max Parameters ---------- prep_roll : pandas DataFrame input data - normally output from perpare_rolling(df) extracted_features : pandas Dataframe DataFrame of features comparison_column_ex: string - name of the extraced feature column it is build: comparison_column + '__' + feauture fraction_max : float percentage of the maximum of the extraced feature. window_size : int window size of the rolled segments -the default is 1000. overlap : int, optional overlap between 2 adjecent windows -The default is 200. 
Returns ------- df : pandas DataFrame relevant_windows dataframe with NaN's in the windows with too low extracted features """ # get added up abs energy of interval x, if too low set None rolling_direction = window_size - overlap relevant_feature = extracted_features[comparison_column_ex] relevant_windows = prep_roll.copy() just_added_NaNs = False liste = [] for i in range(len(extracted_features)): if relevant_feature[i] <= relevant_feature.max() * fraction_max: if just_added_NaNs is True: liste.append(list(range(0 + i * rolling_direction, window_size + i * rolling_direction))) else: liste.append(list(range(0 + i * rolling_direction, window_size + i * rolling_direction - n_gridpoints))) relevant_windows.iloc[i * rolling_direction + window_size - n_gridpoints:i * rolling_direction + window_size, 0:relevant_windows.shape[1]-2] = None just_added_NaNs = True else: just_added_NaNs = False index_liste = [] """ tail = (len(prep_roll)-window_size) % rolling_direction+1 for i in range(tail): liste.append(len(prep_roll)-i-1) """ liste = list(pd.core.common.flatten(liste)) liste = list(set(liste)) for i in range(len(liste)): index_liste.append(relevant_windows.index[liste[i]]) if method == "keep": relevant_windows = relevant_windows.drop(index_liste, axis=0) elif method == "remove": relevant_windows = relevant_windows.loc[index_liste] return relevant_windows def _polyfit_gridpoints(grid_points, prep_roll, order=3, verbose=False, n_gridpoints=3): """Fills gridpoints with polynomial regression Parameters ---------- gridpoints : pandas DataFrame DataFrame with NaN's as gridpoints prep_roll : pandas DataFrame used to create time axis. DataFrame used to create time axis. order : int, optional Order of polynom The default is 3. verbose : boolean, optional If true plots polyfits. The default is False. n_gridpoints : TYPE, optional Number of gridpoints. The default is 3. Returns ------- df : pandas DataFrame DataFrame with polynomial values at the gridpoints. 
""" # add a null row at the start and reset time index delta_t = prep_roll.index[1]-prep_roll.index[0] line = pd.DataFrame(grid_points.iloc[:1], index=[- delta_t]) grid_points = pd.concat([grid_points, line], ignore_index=False) grid_points.index = grid_points.index + delta_t poly_gridpoints = grid_points.sort_index() poly_gridpoints.iloc[0, :] = 0 ts_time = prep_roll.iloc[:len(poly_gridpoints)] poly_gridpoints["time"] = ts_time.index.values poly_gridpoints.index = poly_gridpoints["time"] # %% smooth the gaps with polynomial values poly_gridpoints.interpolate(method='polynomial', order=order, inplace=True) return poly_gridpoints
def clean_timeseries(df, comparison_column, window_size=1000, overlap=800,
                     feature="abs_energy", method="keep", n_gridpoints=3,
                     percentage_max=0.05, order=3):
    """Removes low-feature segments of the data and smooths the resulting gaps

    The data is cut into overlapping windows and ``feature`` is extracted
    from every window with tsfresh.  Windows whose feature value of
    ``comparison_column`` is not greater than ``percentage_max`` times the
    maximum feature value are selected according to ``method``, and the gaps
    are filled with polynomial interpolation.

    Parameters
    ----------
    df : pandas DataFrame
        input DataFrame that shall be cleaned
    comparison_column : str
        column that is used for the feature comparison with percentage_max
    window_size : int, optional
        window size of the rolled segments - The default is 1000.
    overlap : int, optional
        overlap between two adjacent windows - The default is 800.
    feature : string, optional
        extracted feature - only supports one at a time - and only features
        from tsfresh that don't need extra parameters.
        The default is "abs_energy".
    method : string, optional
        * ``'keep'``: the rows of the low-feature windows are dropped
        * ``'remove'``: only the rows of the low-feature windows are kept
    n_gridpoints : int, optional
        number of grid points left at the end of a low-feature window.
        The default is 3.
    percentage_max : float, optional
        min fraction of the maximum to keep the window. The default is 0.05.
    order : int, optional
        order of the polynomial. The default is 3.

    Returns
    -------
    df_poly : pandas DataFrame
        cleaned DataFrame

    Raises
    ------
    ImportError
        if tsfresh is not installed
    """
    if not _HAVE_TSFRESH:
        raise ImportError("tsfresh and dependencies are not installed. \n"
                          "Use `pip install pylife[tsfresh]` to install it.")

    df_prep = _prepare_rolling(df)

    # Time axis for the interpolation: the prepared data with one extra row
    # in front, shifted so that it starts at zero again.
    ts_time = df_prep.copy()
    delta_t = ts_time.index[1]-ts_time.index[0]
    line = pd.DataFrame(ts_time.iloc[:1], index=[- delta_t])
    ts_time = pd.concat([ts_time, line], ignore_index=False)
    ts_time.index = ts_time.index + delta_t
    ts_time = ts_time.sort_index()
    ts_time['time'] = ts_time.index.values

    # tsfresh names its result columns "<column>__<feature>".
    comparison_column_ex = comparison_column + '__'+feature

    df_rolled = _roll_dataset(df_prep, window_size=window_size, overlap=overlap)
    extracted_features = _extract_feature_df(df_rolled, feature)

    # n_gridpoints has to be forwarded here, otherwise the window selection
    # silently falls back to its own default.
    grid_points = _select_relevant_windows(df_prep, extracted_features,
                                           comparison_column_ex,
                                           percentage_max, window_size, overlap,
                                           n_gridpoints=n_gridpoints,
                                           method=method)
    poly_gridpoints = _polyfit_gridpoints(grid_points, ts_time, order=order,
                                          verbose=False,
                                          n_gridpoints=n_gridpoints)

    # Remove NaNs at the end - should be maximum 2n
    cleaned = poly_gridpoints.dropna(axis=0, how='any')
    cleaned.pop("id")
    return cleaned