Source code for xplainable.preprocessing.transformers.dataset

""" Copyright Xplainable Pty Ltd, 2023"""

from .base import XBaseTransformer

import pandas.api.types as pdtypes
import pandas as pd
import numpy as np
import pandas as pd
import scipy.signal as ss


class DropCols(XBaseTransformer):
    """Drops specified columns from a dataset.

    Args:
        columns (list, optional): Names of the columns to drop. A single
            column name passed as a string is treated as a one-item list.
            Defaults to None, in which case nothing is dropped.
    """

    # Informs the embedded GUI which data types this transformer supports.
    supported_types = ['dataset']

    def __init__(self, columns=None):
        super().__init__()
        self.columns = columns

    def __call__(self, df, *args, **kwargs):
        """Builds an interactive widget for choosing the columns to drop.

        Args:
            df (pd.DataFrame): The dataset whose columns are offered.

        Returns:
            The ipywidgets ``interactive`` container; changing the selection
            stores it on ``self.columns``.
        """
        from ipywidgets import interactive
        from ...utils import xwidgets

        def _set_params(columns=xwidgets.SelectMultiple(options=df.columns)):
            self.columns = list(columns)

        return interactive(_set_params)

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Drops specified columns from a dataset.

        Requested columns that are not present in ``df`` are ignored.

        Args:
            df (pd.DataFrame): The dataset to transform.

        Returns:
            pd.DataFrame: A copy of the dataset without the dropped columns.
        """
        if self.columns is None:
            # Default configuration: nothing to drop.
            requested = []
        elif isinstance(self.columns, str):
            # Iterating a bare string would visit its characters instead.
            requested = [self.columns]
        else:
            requested = list(self.columns)

        df = df.copy()
        present = [c for c in requested if c in df.columns]
        return df.drop(columns=present)
class DropNaNs(XBaseTransformer):
    """Removes rows that contain missing values.

    Args:
        subset (list, optional): Columns inspected for missing values.
            When None, every column of the dataset is inspected.
    """

    # Informs the embedded GUI which data types this transformer supports.
    supported_types = ['dataset']

    def __init__(self, subset=None):
        super().__init__()
        self.subset = subset

    def __call__(self, df, *args, **kwargs):
        """Builds an interactive widget for choosing the subset of columns.

        Args:
            df (pd.DataFrame): The dataset whose columns are offered.

        Returns:
            The ipywidgets ``interactive`` container; changing the selection
            stores it on ``self.subset``.
        """
        from ipywidgets import interactive
        from ...utils import xwidgets

        choices = [None] + list(df.columns)

        def _set_params(subset=xwidgets.SelectMultiple(options=choices)):
            self.subset = list(subset)

        return interactive(_set_params)

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Removes rows with missing values in the configured columns.

        Configured columns that are absent from ``df`` are ignored.

        Args:
            df (pd.DataFrame): The dataset to transform.

        Returns:
            pd.DataFrame: The transformed dataset.
        """
        if self.subset is None:
            candidates = list(df.columns)
        else:
            candidates = self.subset

        checked = [name for name in candidates if name in df.columns]
        duplicate = df.copy()
        return duplicate.dropna(subset=checked)
class Operation(XBaseTransformer):
    """Applies operation to multiple columns (in order) into new feature.

    Args:
        columns (list): Column names to combine, in order.
        operation (str): One of 'add', 'multiply', 'average' or
            'concatenate'. Any other value adds no new column.
        alias (str): Name of newly created column. Defaults to the column
            names joined with " + ".
        drop (bool): Drops original columns if True
    """

    # Informs the embedded GUI which data types this transformer supports.
    supported_types = ['dataset']

    def __init__(self, columns=None, operation=None, alias: str = None,
                 drop: bool = False):
        super().__init__()
        # A fresh list per instance; a shared default list would leak state.
        self.columns = [] if columns is None else columns
        self.operation = operation
        self.alias = alias if alias else self._default_alias(self.columns)
        self.drop = drop

    @staticmethod
    def _default_alias(columns):
        """Returns the fallback alias: the column names joined by ' + '."""
        return " + ".join(str(c) for c in columns)

    def __call__(self, dataset, *args, **kwargs):
        """Builds the interactive widget used to configure the operation.

        Args:
            dataset (pd.DataFrame): The dataset whose columns are offered.

        Returns:
            The ipywidgets ``interactive`` container, re-laid-out so that the
            column selector, a hint label and the tags input are stacked.
        """
        from ipywidgets import interactive
        import ipywidgets as widgets
        from ...utils import xwidgets

        cols = list(dataset.columns)

        def _set_params(
                columns_to_apply=xwidgets.SelectMultiple(
                    description='Columns: ',
                    value=[cols[0]],
                    options=cols,
                    allow_duplicates=False),
                tags=xwidgets.TagsInput(
                    value=[cols[0]],
                    allowed_tags=cols
                ),
                operation=['add', 'multiply', 'average', 'concatenate'],
                alias=xwidgets.Text(''),
                drop_columns=xwidgets.Checkbox(value=True)):

            self.columns = list(columns_to_apply)
            self.operation = operation
            # An empty text box falls back to the same default as __init__
            # rather than producing a column literally named ''.
            self.alias = alias if alias else self._default_alias(self.columns)
            self.drop = drop_columns

        widget = interactive(_set_params)
        dd = widget.children[0]
        tags = widget.children[1]
        widget.children[3].layout = widgets.Layout(margin='10px 0 20px 0')
        widgets.link((dd, 'value'), (tags, 'value'))

        label = widgets.HTML('Drag to reorder')
        # Only the label gets the layout; the original chained assignment
        # also overwrote the ``ipywidgets.layout`` module attribute.
        label.layout = widgets.Layout(margin='10px 0 0 0')
        tags_display = widgets.VBox([dd, label, tags])
        tags_display.layout = widgets.Layout(
            margin='0 0 0 30px',
            width='280px'
        )

        widget.children = (tags_display,) + widget.children[2:]

        return widget

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Applies operation to multiple columns (in order) into new feature.

        Args:
            df (pd.DataFrame): The dataset to transform.

        Returns:
            pd.DataFrame: The transformed dataset.

        Raises:
            TypeError: If a numeric operation is requested and any selected
                column is not numeric.
        """
        df = df.copy()
        columns = list(self.columns)
        operation = self.operation

        if operation in ('add', 'multiply', 'average'):
            if not all(pdtypes.is_numeric_dtype(df[col]) for col in columns):
                raise TypeError(
                    f"Cannot {operation} string and numeric columns")

            selected = df[columns]
            if operation == 'add':
                df[self.alias] = selected.sum(axis=1)
            elif operation == 'multiply':
                df[self.alias] = selected.prod(axis=1)
            else:
                df[self.alias] = selected.mean(axis=1)

        elif operation == 'concatenate':
            # Cast on a scratch frame so the source columns keep their dtype
            # in the output when they are not dropped.
            parts = df[columns].copy()
            for col in columns:
                if not pdtypes.is_string_dtype(parts[col]):
                    parts[col] = parts[col].astype(str)

            df[self.alias] = parts.agg('-'.join, axis=1)

        if self.drop:
            df = df.drop(columns=columns)

        return df
class TextTrimMulti(XBaseTransformer):
    """Drops or keeps first/last n characters of a categorical column.

    Args:
        column (str): The column to trim.
        selector (str): [first, last]. Any value other than 'first' is
            handled as 'last'.
        n (int): Number of characters to identify.
        action (str): [keep, drop] the identified characters. Any value
            other than 'keep' is handled as 'drop'.
        drop_col (bool): Drops the original column if True.
        alias (str): Name of the column that receives the trimmed text.
    """

    # Informs the embedded GUI which data types this transformer supports.
    supported_types = ['dataset']

    def __init__(
            self, column='', selector=None, n=0, action='keep',
            drop_col=False, alias=''):
        # Sibling transformers initialise the base class; do the same here.
        super().__init__()
        self.column = column
        self.selector = selector
        self.n = n
        self.action = action
        self.drop_col = drop_col
        self.alias = alias

    def __call__(self, dataset, *args, **kwargs):
        """Builds the interactive widget used to configure the trim.

        Args:
            dataset (pd.DataFrame): The dataset whose columns are offered.

        Returns:
            The ipywidgets ``interactive`` container.
        """
        from ipywidgets import interactive
        import ipywidgets as widgets
        from ...utils import xwidgets

        cols = list(dataset.columns)

        def _set_params(
                column=xwidgets.Dropdown(
                    description='Column: ',
                    value=cols[0],
                    options=cols),
                selector=widgets.Dropdown(options=["first", "last"]),
                # ``value`` is the IntText trait; ``n=1`` was not a trait.
                n=widgets.IntText(value=1),
                action=widgets.Dropdown(options=['keep', 'drop']),
                alias=widgets.Text(''),
                drop_col=widgets.Checkbox(description="drop original")
        ):
            self.selector = selector
            self.n = n
            self.action = action
            self.alias = alias
            self.drop_col = drop_col
            self.column = column

        return interactive(_set_params)

    def _window(self):
        """Returns the slice that implements the configured trim."""
        n = self.n
        from_start = self.selector == "first"

        if self.action == 'keep':
            if from_start:
                return slice(None, n)
            # ``[-0:]`` is the whole string, so "keep last 0" needs an
            # explicitly empty slice.
            return slice(-n, None) if n else slice(0, 0)

        if from_start:
            return slice(n, None)
        # ``[:-0]`` is empty, so "drop last 0" needs the full slice.
        return slice(None, -n) if n else slice(None, None)

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Drops or keeps first/last n characters of a categorical column.

        Args:
            df (pd.DataFrame): The dataset to transform.

        Returns:
            pd.DataFrame: The transformed dataset.
        """
        df = df.copy()
        ser = df[self.column]

        df[self.alias] = ser.str[self._window()]

        if self.drop_col:
            df = df.drop(columns=[self.column])

        return df
class ChangeNames(XBaseTransformer):
    """Renames columns of a dataset.

    Args:
        col_names (dict): Mapping of existing column name to new name.
    """

    # Informs the embedded GUI which data types this transformer supports.
    supported_types = ['dataset']

    def __init__(self, col_names={}):
        super().__init__()
        self.col_names = col_names

    def __call__(self, df, *args, **kwargs):
        """Builds an interactive widget with one text box per column.

        Args:
            df (pd.DataFrame): The dataset whose columns can be renamed.

        Returns:
            The ipywidgets ``interactive`` container; each update stores the
            full name mapping on ``self.col_names``.
        """
        from ipywidgets import interactive
        from ...utils import xwidgets

        def _set_params(*args, **kwargs):
            # The keyword arguments are exactly the {old: new} mapping.
            self.col_names = kwargs

        text_boxes = {}
        for col in df.columns:
            text_boxes[col] = xwidgets.Text(col)

        w = interactive(_set_params, **text_boxes)

        # Blank the description of every child except the last one.
        for child in w.children[:-1]:
            child.description = ''

        return w

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Renames columns of a dataset.

        Mapping entries whose key is not a column of ``df`` are ignored.

        Args:
            df (pd.DataFrame): The dataset to transform.

        Returns:
            pd.DataFrame: The transformed dataset.
        """
        renamed = df.copy()

        mapping = {}
        for old, new in self.col_names.items():
            if old in renamed.columns:
                mapping[old] = new

        return renamed.rename(columns=mapping)
class OrderBy(XBaseTransformer):
    """Orders the dataset by the values of a given series.

    Args:
        order_by (str | list): The column name(s) to order by. None (the
            default) leaves the row order unchanged.
        ascending (bool): Orders in ascending order if True.
    """

    # Informs the embedded GUI which data types this transformer supports.
    supported_types = ['dataset']

    def __init__(self, order_by=None, ascending=True):
        super().__init__()
        self.order_by = order_by
        self.ascending = ascending

    def __call__(self, df, *args, **kwargs):
        """Builds the interactive widget used to configure the ordering.

        Args:
            df (pd.DataFrame): The dataset whose columns are offered.

        Returns:
            The ipywidgets ``interactive`` container.
        """
        from ipywidgets import interactive
        from ...utils import xwidgets

        def _set_params(
                order_by=xwidgets.SelectMultiple(
                    description='Order by: ',
                    options=[None] + list(df.columns)),
                direction=xwidgets.ToggleButtons(
                    description='Direction: ',
                    options=['ascending', 'descending'])):

            self.order_by = list(order_by)
            self.ascending = direction == 'ascending'

        return interactive(_set_params)

    def _sort_keys(self):
        """Returns the configured sort keys as a list without None entries."""
        keys = self.order_by
        if keys is None:
            return []
        if isinstance(keys, (list, tuple)):
            # The GUI offers None as a selectable "no column" option.
            return [k for k in keys if k is not None]
        return [keys]

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Orders the dataset by the values of a given series.

        When no sort key is configured a copy of ``df`` is returned in its
        existing order instead of raising.

        Args:
            df (pd.DataFrame): The dataset to transform.

        Returns:
            pd.DataFrame: The transformed dataset.
        """
        keys = self._sort_keys()
        if not keys:
            return df.copy()

        return df.sort_values(keys, ascending=self.ascending)
class GroupbyShift(XBaseTransformer):
    """Shifts a series up or down n steps within specified group.

    Args:
        columns (list): The target features to shift.
        step (int): The number of steps to shift.
        as_new (bool): Creates new columns if True; otherwise the shifted
            values replace the target columns (unless ``col_names`` is
            given).
        col_names (list): Output column names, paired in order with
            ``columns``. When empty, names are generated (``as_new``) or the
            target names are reused.
        group_by (str | list): The column(s) to group by.
        order_by (str | list): The column(s) to order by.
        descending (bool): Orders the value descending if True.
    """

    # Informs the embedded GUI which data types this transformer supports.
    supported_types = ['dataset']

    def __init__(self, columns=None, step=0, as_new=True, col_names=None,
                 group_by=None, order_by=None, descending=None):
        super().__init__()
        self.columns = columns
        self.step = step
        self.as_new = as_new
        # Own list per instance: a shared default list would accumulate
        # names across instances.
        self.col_names = [] if col_names is None else list(col_names)
        self.group_by = group_by
        self.order_by = order_by
        self.descending = descending

    @staticmethod
    def _as_list(value):
        """Normalises None / scalar / sequence into a list without Nones."""
        if value is None:
            return []
        if isinstance(value, (list, tuple)):
            return [v for v in value if v is not None]
        return [value]

    def _default_col_names(self):
        """Generates '<col>_shift_<step>[_gb_...][_ob_...]' output names."""
        suffix = ''
        group_by = self._as_list(self.group_by)
        order_by = self._as_list(self.order_by)

        if group_by:
            suffix += "_gb_" + '_'.join(str(g) for g in group_by)
        if order_by:
            suffix += "_ob_" + '_'.join(str(o) for o in order_by)

        return [
            f'{target}_shift_{self.step}{suffix}'
            for target in self._as_list(self.columns)
        ]

    def __call__(self, df, *args, **kwargs):
        """Builds the interactive widget used to configure the shift.

        Args:
            df (pd.DataFrame): The dataset whose columns are offered.

        Returns:
            The ipywidgets ``interactive`` container.
        """
        from ipywidgets import interactive
        from ...utils import xwidgets

        def _set_params(
                group_by=xwidgets.SelectMultiple(
                    description='Group by: ',
                    options=[None] + list(df.columns)),
                order_by=xwidgets.SelectMultiple(
                    description='Order by: ',
                    options=[None] + list(df.columns)),
                descending=xwidgets.Checkbox(value=False),
                columns=xwidgets.SelectMultiple(
                    description='Columns: ',
                    options=[None] + list(df.columns)),
                step=xwidgets.IntText(value=0, min=-1000, max=1000),
                as_new=xwidgets.Checkbox(value=False)
        ):
            # None is a selectable placeholder in every list; discard it.
            self.columns = self._as_list(columns)
            self.group_by = self._as_list(group_by)
            self.order_by = self._as_list(order_by)
            self.descending = descending
            self.step = step
            self.as_new = as_new

            # Rebuild from scratch on every update; appending would keep
            # stale names from earlier selections.
            self.col_names = self._default_col_names() if as_new else []

        return interactive(_set_params)

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Shifts a series up or down n steps within specified group.

        The returned frame is sorted by ``order_by`` when one is configured.

        Args:
            df (pd.DataFrame): The dataset to transform.

        Returns:
            pd.DataFrame: The transformed dataset.
        """
        columns = self._as_list(self.columns)
        group_by = self._as_list(self.group_by)
        order_by = self._as_list(self.order_by)

        # Either branch yields a new frame, so the caller's data is untouched.
        if order_by:
            df = df.sort_values(order_by, ascending=not self.descending)
        else:
            df = df.copy()

        if self.col_names:
            names = list(self.col_names)
        elif self.as_new:
            names = self._default_col_names()
        else:
            # In-place shift: write back onto the target columns.
            names = columns

        for col_name, target in zip(names, columns):
            source = df.groupby(group_by)[target] if group_by else df[target]
            df[col_name] = source.shift(self.step)

        return df
class FillMissing(XBaseTransformer):
    """Fills missing values of all columns with a specified value/strategy.

    Args:
        fill_with (dict): Maps column name to a strategy ('mean', 'median',
            'mode') or to a literal fill value.
        fill_values (dict): Maps column name to the concrete fill value.
            Populated by ``fit``; may also be supplied directly.
    """

    # Informs the embedded GUI which data types this transformer supports.
    supported_types = ['dataset']

    def __init__(self, fill_with=None, fill_values=None):
        super().__init__()
        # Per-instance dicts: ``fit`` writes into ``fill_values``, so a
        # shared default dict would leak fitted values between instances.
        self.fill_with = {} if fill_with is None else dict(fill_with)
        self.fill_values = {} if fill_values is None else dict(fill_values)

    def __call__(self, df, *args, **kwargs):
        """Builds an interactive widget with one control per column.

        Numeric columns get a strategy dropdown; other columns get a text
        box pre-filled with 'missing'.

        Args:
            df (pd.DataFrame): The dataset whose columns are offered.

        Returns:
            The ipywidgets ``interactive`` container.
        """
        from ipywidgets import interactive
        from ...utils import xwidgets

        def _set_params(*args, **kwargs):
            self.fill_with = kwargs

        def get_widget(col):
            if pdtypes.is_numeric_dtype(df[col]):
                return xwidgets.Dropdown(options=["mean", "median", "mode"])
            return xwidgets.Text(value='missing')

        col_xwidgets = {col: get_widget(col) for col in df.columns}

        return interactive(_set_params, **col_xwidgets)

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Fills missing values of all columns with a specified value/strategy.

        Columns listed in ``fill_values`` but absent from ``df`` are skipped.

        Args:
            df (pd.DataFrame): The dataset to transform.

        Returns:
            pd.DataFrame: A copy of the dataset with missing values filled.
        """
        df = df.copy()

        for col, value in self.fill_values.items():
            if col not in df.columns:
                continue
            df[col] = df[col].fillna(value)

        return df

    def fit(self, df: pd.DataFrame) -> 'FillMissing':
        """Calculates the fill_value for all columns in the dataset.

        The fill values are based on a specified strategy for each column.
        Columns listed in ``fill_with`` but absent from ``df`` are skipped.

        Args:
            df (pd.DataFrame): The dataset to fit

        Returns:
            FillMissing: The fitted transformer.
        """
        for col, strategy in self.fill_with.items():
            if col not in df.columns:
                continue

            # Calculate fill_value if mean, median or mode
            if strategy == 'mean':
                value = round(np.nanmean(df[col]), 4)
            elif strategy == 'median':
                value = np.nanmedian(df[col])
            elif strategy == 'mode':
                modes = df[col].mode()
                if modes.empty:
                    # No non-missing value exists, so there is no mode.
                    continue
                # ``mode`` returns a Series; use its first entry as the
                # scalar fill value.
                value = modes.iloc[0]
            else:
                value = strategy

            # Maintain fill value type (if int)
            if pdtypes.is_integer_dtype(df[col]):
                value = int(value)

            self.fill_values[col] = value

        return self
class SetDTypes(XBaseTransformer):
    """Sets the data type of all columns in the dataset.

    Args:
        types (dict): Dictionary of column names and data types. Recognised
            type names are 'string', 'float', 'integer', 'date' and
            'boolean'.
    """

    # Informs the embedded GUI which data types this transformer supports.
    supported_types = ['dataset']

    def __init__(self, types=None):
        super().__init__()
        self.types = {} if types is None else types

    def __call__(self, df, *args, **kwargs):
        """Builds an interactive widget with one type dropdown per column.

        Args:
            df (pd.DataFrame): The dataset whose columns are offered.

        Returns:
            The ipywidgets ``interactive`` container.
        """
        from ipywidgets import interactive
        from ...utils import xwidgets

        def _set_params(*args, **kwargs):
            self.types = kwargs

        def get_widget(col):
            if pdtypes.is_float_dtype(df[col]):
                options = ["float", "integer", "string"]
                value = 'float'

            elif pdtypes.is_integer_dtype(df[col]):
                options = ["float", "integer", "string"]
                value = 'integer'

            elif pdtypes.is_datetime64_dtype(df[col]):
                options = ["date", "string"]
                value = "date"

            elif pdtypes.is_bool_dtype(df[col]):
                options = ["boolean", "string", "integer", "float"]
                value = "boolean"

            else:
                # String columns, plus any dtype not matched above; the
                # latter previously left ``options``/``value`` unbound.
                options = ["float", "integer", "string"]
                value = 'string'

            return xwidgets.Dropdown(
                options=options,
                value=value,
                style={'description_width': 'initial'}
            )

        col_xwidgets = {col: get_widget(col) for col in df.columns}

        return interactive(_set_params, **col_xwidgets)

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Sets the data type of all columns in the dataset.

        Columns named in ``types`` but absent from ``df`` are skipped.
        Values that cannot be parsed as numbers or dates are coerced to
        missing.

        Args:
            df (pd.DataFrame): The dataset to transform.

        Returns:
            pd.DataFrame: A copy of the dataset with the requested dtypes.
        """
        df = df.copy()

        for col, dtype in self.types.items():
            if col not in df.columns:
                continue

            if dtype == 'string':
                df[col] = df[col].astype(str)

            elif dtype == 'date':
                # Must not fall through to the numeric path, which would
                # turn datetimes into integers.
                df[col] = pd.to_datetime(df[col], errors='coerce')

            elif dtype == 'boolean':
                df[col] = df[col].astype(bool)

            else:
                # 'float', 'integer' and any unrecognised name are coerced
                # to numeric, as before.
                df[col] = pd.to_numeric(df[col], errors='coerce')

                if dtype == 'integer':
                    # If missing value are present, cannot cast to int;
                    # the column is then left as converted by to_numeric.
                    try:
                        df[col] = df[col].astype(int)
                    except (ValueError, TypeError, OverflowError):
                        pass

        return df
class TextSplit(XBaseTransformer):
    """Splits a string column into multiple columns on a specified separator.

    Args:
        target (str): The column to split.
        separator (str): The separator to split on. None or an empty string
            is passed to pandas as None.
        max_splits (int): The maximum number of splits to make.
    """

    # Informs the embedded GUI which data types this transformer supports.
    supported_types = ['dataset']

    def __init__(self, target=None, separator=None, max_splits=0):
        # Sibling transformers initialise the base class; do the same here.
        super().__init__()
        self.target = target
        self.separator = separator
        self.max_splits = max_splits

    def __call__(self, df, *args, **kwargs):
        """Builds the interactive widget used to configure the split.

        Only columns with a string dtype are offered as targets.

        Args:
            df (pd.DataFrame): The dataset whose columns are offered.

        Returns:
            The ipywidgets ``interactive`` container.
        """
        from ipywidgets import interactive
        from ...utils import xwidgets

        string_cols = [
            i for i in df.columns if pdtypes.is_string_dtype(df[i])]

        def _set_params(
                target=xwidgets.Dropdown(options=[None] + string_cols),
                separator=xwidgets.Text(value=""),
                max_splits=xwidgets.IntText(range=[0, 10])):

            self.target = target
            self.separator = separator
            self.max_splits = max([max_splits, 0])

        return interactive(_set_params)

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Splits a string column into multiple columns on a separator.

        The new columns are named ``<target>_<i>`` and the target column is
        removed. When no target is configured, or the target is not a column
        of ``df``, an unchanged copy is returned.

        Args:
            df (pd.DataFrame): The dataset to transform.

        Returns:
            pd.DataFrame: The transformed dataset.
        """
        df = df.copy()

        if self.target is None or self.target not in df.columns:
            return df

        # An empty separator is what the GUI text box holds before the user
        # types anything; hand pandas None instead.
        separator = self.separator if self.separator else None

        new_cols = df[self.target].astype(str).str.split(
            separator, expand=True, n=self.max_splits)

        new_cols.columns = [f'{self.target}_{i}' for i in new_cols]
        df[new_cols.columns] = new_cols

        return df.drop(columns=[self.target])
class ChangeCases(XBaseTransformer):
    """Changes the case of all specified categorical columns.

    Args:
        columns (list): To apply the case change to.
        case (str): 'upper' or 'lower'.
    """

    # Informs the embedded GUI which data types this transformer supports.
    supported_types = ['dataset']

    def __init__(self, columns=None, case='lower'):
        super().__init__()
        # Fresh list per instance instead of a shared mutable default.
        self.columns = [] if columns is None else columns
        self.case = case

    def __call__(self, df, *args, **kwargs):
        """Builds the interactive widget used to configure the case change.

        Only columns with a string dtype are offered.

        Args:
            df (pd.DataFrame): The dataset whose columns are offered.

        Returns:
            The ipywidgets ``interactive`` container.
        """
        from ipywidgets import interactive
        from ...utils import xwidgets

        string_cols = [
            i for i in df.columns if pdtypes.is_string_dtype(df[i])]

        def _set_params(
                columns=xwidgets.SelectMultiple(
                    description='Columns: ',
                    options=[None] + string_cols),
                case=xwidgets.Dropdown(
                    description='Case: ',
                    options=["lower", "upper"])):

            self.columns = list(columns)
            self.case = case

        return interactive(_set_params)

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Changes the case of all specified categorical columns.

        Configured columns that are absent from ``df`` are skipped.

        Args:
            df (pd.DataFrame): The dataset to transform.

        Returns:
            pd.DataFrame: A copy of the dataset with the case changed.

        Raises:
            ValueError: If ``case`` is neither 'lower' nor 'upper' and at
                least one configured column is present.
        """
        # Work on a copy so the caller's frame is not modified, matching
        # the sibling transformers that copy before editing.
        df = df.copy()

        for col in self.columns:
            if col not in df.columns:
                continue

            if self.case == 'lower':
                df[col] = df[col].str.lower()
            elif self.case == 'upper':
                df[col] = df[col].str.upper()
            else:
                raise ValueError(
                    "case change must be either 'lower' or 'upper'")

        return df
class GroupedSignalSmoothing(XBaseTransformer):
    """Smooths signal data within specified group.

    Smoothing uses ``scipy.signal.savgol_filter`` with ``deriv=0`` and
    ``mode='interp'``; the smoothed values replace the target columns.

    Args:
        target (str | list): The target feature(s) to smooth.
        group_by (str | list): The column(s) to group by.
        order_by (str | list): The column(s) to order by.
        descending (bool): Orders the value descending if True.
        window_length (int): Filter window length passed to savgol_filter.
        polyorder (int): Polynomial order passed to savgol_filter.
    """

    # Informs the embedded GUI which data types this transformer supports.
    supported_types = ['dataset']

    def __init__(self, target=None, group_by=None,
                 order_by=None, descending=None,
                 window_length=19, polyorder=1):
        super().__init__()
        self.target = target
        # ``transform`` works on ``targets``; previously only the GUI set
        # it, so programmatic use failed with AttributeError.
        self.targets = self._as_list(target)
        self.group_by = group_by
        self.order_by = order_by
        self.descending = descending
        self.window_length = window_length
        self.polyorder = polyorder

    @staticmethod
    def _as_list(value):
        """Normalises None / scalar / sequence into a list without Nones."""
        if value is None:
            return []
        if isinstance(value, (list, tuple)):
            return [v for v in value if v is not None]
        return [value]

    def __call__(self, df, *args, **kwargs):
        """Builds the interactive widget used to configure the smoothing.

        Args:
            df (pd.DataFrame): The dataset whose columns are offered.

        Returns:
            The ipywidgets ``interactive`` container.
        """
        from ipywidgets import interactive
        from ...utils import xwidgets

        def _set_params(
                group_by=xwidgets.SelectMultiple(
                    description='Group by: ',
                    options=[None] + list(df.columns)),
                order_by=xwidgets.SelectMultiple(
                    description='Order by: ',
                    options=[None] + list(df.columns)),
                descending=xwidgets.Checkbox(value=False),
                targets=xwidgets.SelectMultiple(
                    options=[None] + list(df.columns))
        ):
            # None is a selectable placeholder in every list; discard it.
            self.targets = self._as_list(targets)
            self.group_by = self._as_list(group_by)
            self.order_by = self._as_list(order_by)
            self.descending = descending

        return interactive(_set_params)

    def _smooth(self, values):
        """Applies the Savitzky-Golay filter to one series of values."""
        return ss.savgol_filter(
            values,
            window_length=self.window_length,
            polyorder=self.polyorder,
            deriv=0,
            mode='interp')

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Smooths signal data within specified group.

        The returned frame is sorted by ``order_by`` when one is configured.

        Args:
            df (pd.DataFrame): The dataset to transform.

        Returns:
            pd.DataFrame: The transformed dataset.
        """
        targets = self._as_list(self.targets) or self._as_list(self.target)
        group_by = self._as_list(self.group_by)
        order_by = self._as_list(self.order_by)

        # Either branch yields a new frame, so the caller's data is untouched.
        if order_by:
            df = df.sort_values(order_by, ascending=not self.descending)
        else:
            df = df.copy()

        for t in targets:
            if group_by:
                df[t] = df.groupby(group_by)[t].transform(self._smooth)
            else:
                df[t] = self._smooth(df[t])

        return df
class DateTimeExtract(XBaseTransformer):
    """Extracts Datetime values from datetime object.

    Each enabled part is written to a new column named ``<target>_<part>``.

    Args:
        target (str): The datetime column to extract from.
        year (bool): Extracts year if True.
        month (bool): Extracts month if True.
        day (bool): Extracts day if True.
        weekday (bool): Extracts weekday if True.
        day_name (bool): Extracts day name if True.
        hour (bool): Extracts hour if True.
        minute (bool): Extracts minute if True.
        second (bool): Extracts second if True.
        drop (bool): Drops original datetime column if True.
    """

    # Informs the embedded GUI which data types this transformer supports.
    supported_types = ['dataset']

    # Extractable parts, in output order. Each name is the flag attribute,
    # the ``.dt`` accessor member and the output column suffix.
    _PARTS = ('year', 'month', 'day', 'weekday', 'day_name',
              'hour', 'minute', 'second')

    def __init__(self, target=None, year=False, month=False, day=False,
                 weekday=False, day_name=False, hour=False, minute=False,
                 second=False, drop=False):
        # Sibling transformers initialise the base class; do the same here.
        super().__init__()
        self.target = target
        self.year = year
        self.month = month
        self.day = day
        self.weekday = weekday
        self.day_name = day_name
        self.hour = hour
        self.minute = minute
        self.second = second
        self.drop = drop

    def __call__(self, df, *args, **kwargs):
        """Builds the widget used to pick the target and parts to extract.

        Args:
            df (pd.DataFrame): The dataset whose columns are offered.

        Returns:
            An ipywidgets ``VBox`` holding the target dropdown, two columns
            of toggle buttons and the drop checkbox.
        """
        from ipywidgets import interactive
        import ipywidgets as widgets

        def _set_params(
                target=widgets.Dropdown(options=df.columns),
                year=widgets.ToggleButton(value=False),
                month=widgets.ToggleButton(value=False),
                day=widgets.ToggleButton(value=False),
                weekday=widgets.ToggleButton(value=False),
                day_name=widgets.ToggleButton(value=False),
                hour=widgets.ToggleButton(value=False),
                minute=widgets.ToggleButton(value=False),
                second=widgets.ToggleButton(value=False),
                drop=widgets.Checkbox(value=False)
        ):
            self.target = target
            self.year = year
            self.month = month
            self.day = day
            self.weekday = weekday
            self.day_name = day_name
            self.hour = hour
            self.minute = minute
            self.second = second
            self.drop = drop

        w = interactive(_set_params)
        _target = w.children[0]
        left = widgets.VBox(w.children[1:5])
        right = widgets.VBox(w.children[5:9])
        buttons = widgets.HBox([left, right])
        _drop = w.children[9]

        elements = widgets.VBox([_target, buttons, _drop])

        return elements

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Extracts Datetime values from datetime object.

        Args:
            df (pd.DataFrame): The dataset to transform.

        Returns:
            pd.DataFrame: The transformed dataset.

        Raises:
            TypeError: If the target column cannot be converted to datetime.
        """
        df = df.copy()
        target = self.target

        try:
            df[target] = pd.to_datetime(df[target])
        except Exception as err:
            # Same TypeError contract as before, but the original cause is
            # chained and KeyboardInterrupt/SystemExit are no longer caught.
            raise TypeError(
                f"{self.target} can not be coerced to datetime") from err

        stamps = df[target]
        for part in self._PARTS:
            if not getattr(self, part):
                continue

            if part == 'day_name':
                # day_name is a method on the accessor, not a property.
                df[f'{target}_{part}'] = stamps.dt.day_name()
            else:
                df[f'{target}_{part}'] = getattr(stamps.dt, part)

        if self.drop:
            df = df.drop(columns=[target])

        return df
class RollingOperation(XBaseTransformer):
    """Applies a rolling-window aggregation to one or more numeric columns.

    For each selected column a new column named
    ``<column>_<operation>_<window>`` is added.

    Args:
        groupby (str | list): Column(s) to group by before rolling.
        orderby (str | list): Column(s) that define the row order within
            each group. Only used when ``groupby`` is set.
        direction (str): 'ascending' sorts ascending; any other value sorts
            descending.
        columns (list): Column names to aggregate.
        window (int): The rolling window size.
        operation (str): One of 'mean', 'sum', 'max' or 'min'. Any other
            value adds no new column.
        drop (bool): Drops original columns if True
    """

    # Informs the embedded GUI which data types this transformer supports.
    supported_types = ['dataset']

    _OPERATIONS = ('mean', 'sum', 'max', 'min')

    def __init__(
            self, groupby=None, orderby=None, direction=None, columns=None,
            window=None, operation=None, drop: bool = False
    ):
        super().__init__()
        self.groupby = groupby
        self.orderby = orderby
        self.direction = direction
        # Fresh list per instance instead of a shared mutable default.
        self.columns = [] if columns is None else columns
        self.window = window
        self.operation = operation
        self.drop = drop

    @staticmethod
    def _as_list(value):
        """Normalises None / scalar / sequence into a list without Nones."""
        if value is None:
            return []
        if isinstance(value, (list, tuple)):
            return [v for v in value if v is not None]
        return [value]

    def __call__(self, df, *args, **kwargs):
        """Builds the interactive widget used to configure the operation.

        Args:
            df (pd.DataFrame): The dataset whose columns are offered.

        Returns:
            The ipywidgets ``interactive`` container.
        """
        from ipywidgets import interactive
        import ipywidgets as widgets
        from ...utils import xwidgets

        cols = list(df.columns)

        def _set_params(
                group_by=xwidgets.SelectMultiple(
                    description='Group by: ',
                    options=[None] + list(df.columns)),
                order_by=xwidgets.SelectMultiple(
                    description='Order by: ',
                    options=[None] + list(df.columns)),
                direction=xwidgets.ToggleButtons(
                    description='Direction: ',
                    options=['ascending', 'descending']),
                columns_to_apply=xwidgets.SelectMultiple(
                    description='Columns: ',
                    value=[cols[0]],
                    options=cols,
                    allow_duplicates=False),
                window=xwidgets.IntText(min=2, value=3),
                operation=['mean', 'sum', 'max', 'min'],
                drop_columns=xwidgets.Checkbox(value=True)):

            # None is a selectable placeholder in the lists; discard it.
            self.groupby = self._as_list(group_by)
            self.orderby = self._as_list(order_by)
            self.direction = direction
            self.columns = list(columns_to_apply)
            self.operation = operation
            self.window = window
            self.drop = drop_columns

        widget = interactive(_set_params)
        widget.children[6].layout = widgets.Layout(margin='10px 0 20px 0')

        widget.layout = widgets.Layout(
            margin='0 0 0 30px',
            width='280px'
        )

        return widget

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Adds rolling aggregates of the selected columns.

        Without ``groupby`` the window rolls over the rows in their existing
        order. With ``groupby`` the rows are ordered by ``orderby`` inside
        each group and each result is written back to the row it was
        computed for. The output keeps the input's row order.

        Args:
            df (pd.DataFrame): The dataset to transform.

        Returns:
            pd.DataFrame: The transformed dataset.

        Raises:
            AssertionError: If any selected column is not numeric.
        """
        df = df.copy()
        columns = list(self.columns)

        # Raised explicitly (a bare ``assert`` is stripped under ``-O``);
        # the exception type is kept for existing callers.
        if not all(pdtypes.is_numeric_dtype(df[col]) for col in columns):
            raise AssertionError("Selected columns must be numeric")

        group_by = self._as_list(self.groupby)
        order_by = self._as_list(self.orderby)
        ascending = self.direction == 'ascending'
        operation = self.operation

        if operation in self._OPERATIONS and columns:
            if group_by:
                # Label every row with its position so results can be
                # mapped back after sorting/grouping reorders them.
                work = df.reset_index(drop=True)
                if order_by:
                    work = work.sort_values(order_by, ascending=ascending)
                grouped = work.groupby(group_by)
                positions = np.arange(len(df))

                for col in columns:
                    rolled = getattr(
                        grouped[col].rolling(self.window), operation)()
                    # Index is (group keys..., row position); keep only the
                    # position level, then restore the input order.
                    rolled.index = rolled.index.get_level_values(-1)
                    alias = f'{col}_{operation}_{self.window}'
                    df[alias] = rolled.reindex(positions).to_numpy()
            else:
                for col in columns:
                    alias = f'{col}_{operation}_{self.window}'
                    df[alias] = getattr(
                        df[col].rolling(self.window), operation)()

        if self.drop:
            df = df.drop(columns=columns)

        return df