Source code for xplainable.preprocessing.transformers.categorical

""" Copyright Xplainable Pty Ltd, 2023"""

from ...utils import stopwords

from .base import XBaseTransformer
from pandas.api.types import is_string_dtype
import re
import numpy as np
import pandas as pd


class TextRemove(XBaseTransformer):
    """Remove specified values from a str type series.

    This transformer cannot be inverse_transformed and does not require
    fitting.

    Args:
        numbers (bool, optional): Removes numbers from string.
        characters (bool, optional): Removes characters from string.
        uppercase (bool, optional): Removes uppercase characters from string.
        lowercase (bool, optional): Removes lowercase characters from string.
        special (bool, optional): Removes special characters from string.
        whitespace (bool, optional): Removes whitespace from string.
        stopwords (bool, optional): Removes stopwords from string.
        text (str, optional): Removes specific text match from string.
        custom_regex (str, optional): Removes matching regex text from string.
    """

    # Informs the embedded GUI which data types this transformer supports.
    supported_types = ['categorical']

    def __init__(self, numbers=False, characters=False, uppercase=False,
                 lowercase=False, special=False, whitespace=False,
                 stopwords=False, text=None, custom_regex=None):
        super().__init__()
        self.numbers = numbers
        self.characters = characters
        self.uppercase = uppercase
        self.lowercase = lowercase
        self.special = special
        self.whitespace = whitespace
        self.stopwords = stopwords
        self.text = text
        self.custom_regex = custom_regex

    def __call__(self, *args, **kwargs):
        """Build the widget layout used to configure this transformer.

        Returns:
            ipywidgets.VBox: Toggle buttons for the boolean options followed
                by the two text inputs.
        """
        import ipywidgets as widgets
        from ipywidgets import interactive
        from ...utils import xwidgets

        def _set_params(
                numbers=widgets.ToggleButton(value=False),
                characters=widgets.ToggleButton(value=False),
                uppercase=widgets.ToggleButton(value=False),
                lowercase=widgets.ToggleButton(value=False),
                special=widgets.ToggleButton(value=False),
                whitespace=widgets.ToggleButton(value=False),
                stopwords=widgets.ToggleButton(value=False),
                text=xwidgets.Text(''),
                regex=xwidgets.Text('')):
            self.numbers = numbers
            self.characters = characters
            self.uppercase = uppercase
            self.lowercase = lowercase
            self.special = special
            self.whitespace = whitespace
            self.stopwords = stopwords
            # An empty text box means "option not set".
            self.text = text if text != '' else None
            self.custom_regex = regex if regex != '' else None

        w = interactive(_set_params)

        # First four toggles on the left, remaining three on the right,
        # text inputs underneath.
        left = widgets.VBox(w.children[:4])
        right = widgets.VBox(w.children[4:7])
        buttons = widgets.HBox([left, right])
        text = widgets.VBox(
            w.children[7:], layout=widgets.Layout(margin='20px 0 0 0'))

        return widgets.VBox([buttons, text])

    def transform(self, ser: pd.Series) -> pd.Series:
        """Removes specified values from a str type series.

        The input series is not modified; a new series is returned.

        Args:
            ser (pd.Series): The series to transform.

        Returns:
            pd.Series: The transformed series.
        """
        matches = []

        if self.numbers:
            matches.append(r'[0-9]')

        # `characters` already covers both cases, so the case-specific
        # flags are only consulted when it is off.
        if self.characters:
            matches.append(r'[a-zA-Z]')
        else:
            if self.uppercase:
                matches.append(r'[A-Z]')
            if self.lowercase:
                matches.append(r'[a-z]')

        if self.special:
            matches.append(r'[^a-zA-Z0-9 ]')

        if self.whitespace:
            matches.append(r' ')

        if self.custom_regex:
            matches.append(self.custom_regex)

        if matches:
            regex = re.compile("|".join(matches))
            present = ser.notna()
            # Work on a copy so the caller's series is never written to.
            ser = ser.copy()
            ser[present] = ser[present].apply(lambda x: regex.sub('', x))

        if self.text:
            # `text` is a literal match (patterns go through `custom_regex`),
            # so regex interpretation is switched off explicitly rather than
            # relying on the pandas default.
            ser = ser.str.replace(self.text, "", regex=False)

        if self.stopwords:
            # `stopwords` below is the module-level import, not the boolean
            # flag stored on the instance.
            ser = ser.apply(
                lambda x: " ".join(
                    i for i in x.split() if i not in stopwords)
                if isinstance(x, str) else x)

        return ser
class ChangeCase(XBaseTransformer):
    """Changes the case of a string.

    Args:
        case (str): 'upper' or 'lower'
    """

    # Informs the embedded GUI which data types this transformer supports.
    supported_types = ['categorical']

    def __init__(self, case='lower'):
        super().__init__()
        self.case = case

    def __call__(self, *args, **kwargs):
        """Return the interactive widget used to pick the target case."""
        from ipywidgets import interactive

        def _set_params(case=["lower", "upper"]):
            self.case = case

        return interactive(_set_params)

    def transform(self, ser: pd.Series) -> pd.Series:
        """Changes the case of a string.

        Only values whose exact type is ``str`` or ``bool`` are converted
        (booleans become their string form); everything else is returned
        untouched.

        Args:
            ser (pd.Series): The series to transform.

        Raises:
            ValueError: If ``case`` is neither 'lower' nor 'upper'.

        Returns:
            pd.Series: The transformed series.
        """
        if self.case == 'lower':
            convert = str.lower
        elif self.case == 'upper':
            convert = str.upper
        else:
            raise ValueError("case change must be either 'lower' or 'upper'")

        def _recase(value):
            if type(value) in (str, bool):
                return convert(str(value))
            return value

        return ser.apply(_recase)
class DetectCategories(XBaseTransformer):
    """Auto-detects categories from a string column.

    Args:
        max_categories (int): The maximum number of categories to extract.
        category_list (list): Categories to detect. Replaced by ``fit``.
    """

    # Informs the embedded GUI which data types this transformer supports.
    supported_types = ['categorical']

    def __init__(self, max_categories=10, category_list=[]):
        super().__init__()
        self.max_categories = int(max_categories)
        # Copy so the shared default list object is never stored on (and
        # therefore never shared between) instances.
        self.category_list = (
            list(category_list) if category_list is not None else [])

    def __call__(self, *args, **kwargs):
        """Return the interactive widget used to pick ``max_categories``."""
        from ipywidgets import interactive

        def _set_params(max_categories=(2, 50)):
            self.max_categories = max_categories

        return interactive(_set_params)

    def transform(self, ser: pd.Series) -> pd.Series:
        """Detects categories from a string column.

        Each value is split on whitespace and mapped to the first token that
        is a known category, or to 'other' when no token matches. Missing
        values are treated as the text 'missing'.

        Args:
            ser (pd.Series): The series to transform.

        Raises:
            TypeError: If the series is not of type string.

        Returns:
            pd.Series: The transformed series.
        """
        if not is_string_dtype(ser):
            raise TypeError('Cannot detect categories for non-text field.')

        known = set(self.category_list)

        def _first_known(tokens):
            # Taking the first match in token order keeps the result
            # deterministic when a value contains several known categories.
            return next((tok for tok in tokens if tok in known), 'other')

        # Map top categories if exists else 'other'
        ser = ser.fillna('missing')
        return ser.str.split().apply(_first_known)

    def fit(self, ser: pd.Series) -> 'DetectCategories':
        """Identifies the top categories from a text series.

        Args:
            ser (pandas.Series): The series in which to analyse.

        Returns:
            DetectCategories: The fitted transformer.
        """
        # Get the top n categories (whitespace-separated tokens) by count
        ser = ser.fillna('missing')
        token_counts = ser.str.split().explode().value_counts()
        self.category_list = list(
            token_counts.head(self.max_categories).index)

        return self
class Condense(XBaseTransformer):
    """Condenses a feature into categories that make up x pct of observations.

    Args:
        pct (float): The minimum fraction of observations (between 0 and 1)
            the retained categories should cover.
        categories (list): The categories to retain. Replaced by ``fit``.
    """

    # Informs the embedded GUI which data types this transformer supports.
    supported_types = ['categorical']

    def __init__(self, pct=0.8, categories=[]):
        super().__init__()
        self.pct = pct
        # Copy so the shared default list object is never stored on (and
        # therefore never shared between) instances.
        self.categories = list(categories) if categories is not None else []

    def __call__(self, *args, **kwargs):
        """Return the interactive widget used to pick ``pct``."""
        from ipywidgets import interactive

        def _set_params(pct=(0, 100)):
            # The slider works in whole percentages; store a fraction.
            self.pct = pct / 100

        return interactive(_set_params)

    def fit(self, ser: pd.Series) -> 'Condense':
        """Determines the categories that make up x pct of observations.

        Args:
            ser (pandas.Series): The series in which to analyse.

        Raises:
            TypeError: If the series is not of type string.

        Returns:
            Condense: The fitted transformer.
        """
        if not is_string_dtype(ser):
            raise TypeError('Series must be of type string.')

        # Fraction of observations each category makes up, most frequent
        # first.
        vc = ser.value_counts() / len(ser)

        # Instantiate trackers
        cum_pct = 0
        top_categories = None

        # Iterate through top categories until minimum pct is reached
        for i, v in enumerate(vc):
            cum_pct += v
            top_categories = i + 1
            if cum_pct >= self.pct:
                break

        # Positional slice; when `vc` is empty `top_categories` stays None
        # and the (empty) whole is taken.
        self.categories = list(vc.iloc[:top_categories].index)

        return self

    def transform(self, ser: pd.Series) -> pd.Series:
        """Condenses a feature into categories that make up x pct of
        observations.

        Values outside the retained categories become 'other'; missing
        values are left as they are.

        Args:
            ser (pd.Series): The series to transform.

        Raises:
            ValueError: If the series is not of type string.

        Returns:
            pd.Series: The transformed series.
        """
        if not is_string_dtype(ser):
            raise ValueError('Cannot condense categories for non-text field.')

        # Convert non-top categories to 'other'. Using a mask (rather than
        # multiplying strings by booleans) keeps a retained empty-string
        # category intact.
        keep = ser.isin(self.categories) | ser.isna()
        return ser.where(keep, 'other')
class MergeCategories(XBaseTransformer):
    """Merges specified categories in a series into one category.

    Args:
        merge_from (list): List of categories to merge from.
        merge_to (str): The category to merge to.
    """

    # Informs the embedded GUI which data types this transformer supports.
    supported_types = ['categorical']

    def __init__(self, merge_from=[], merge_to=''):
        super().__init__()
        # Copy so the shared default list object is never stored on (and
        # therefore never shared between) instances.
        self.merge_from = list(merge_from) if merge_from is not None else []
        self.merge_to = merge_to

    def __call__(self, column, *args, **kwargs):
        """Return the interactive widget used to pick the categories.

        Args:
            column (pd.Series): The column whose unique non-null values are
                offered as options.
        """
        from ipywidgets import interactive
        import ipywidgets as widgets

        unq = column.dropna().unique()

        def _set_params(
                merge_from=widgets.SelectMultiple(options=unq),
                merge_to=unq):
            self.merge_from = list(merge_from)
            self.merge_to = merge_to

        return interactive(_set_params)

    def transform(self, ser: pd.Series) -> pd.Series:
        """Merges specified categories in a series into one category.

        Args:
            ser (pd.Series): The series to transform.

        Returns:
            pd.Series: The transformed series.
        """
        sources = self.merge_from
        target = self.merge_to

        # Apply merging
        return ser.apply(lambda x: target if x in sources else x)
class ReplaceCategory(XBaseTransformer):
    """Replaces a category in a series with specified value.

    Args:
        target: The target value to replace.
        replace_with: The value to insert in place.
    """

    # Informs the embedded GUI which data types this transformer supports.
    supported_types = ['categorical']

    def __init__(self, target=None, replace_with=''):
        super().__init__()
        self.target = target
        self.replace_with = replace_with

    def __call__(self, column, *args, **kwargs):
        """Return the interactive widget used to pick the replacement.

        Args:
            column (pd.Series): The column whose unique non-null values are
                offered as replacement targets.
        """
        from ipywidgets import interactive

        unq = column.dropna().unique()

        def _set_params(target=unq, replace_with=''):
            self.target = target
            self.replace_with = replace_with

        return interactive(_set_params)

    def transform(self, ser: pd.Series) -> pd.Series:
        """Replaces a category in a series with specified value.

        Args:
            ser (pd.Series): The series to transform.

        Returns:
            pd.Series: The transformed series.
        """
        # Apply value replacement
        return ser.replace(to_replace=self.target, value=self.replace_with)
class FillMissingCategorical(XBaseTransformer):
    """Fills missing values with a specified value.

    Args:
        fill_with (str): Text to fill with.
    """

    # Informs the embedded GUI which data types this transformer supports.
    supported_types = ['categorical']

    def __init__(self, fill_with='missing'):
        super().__init__()
        self.fill_with = fill_with

    def __call__(self, *args, **kwargs):
        """Return the interactive widget used to enter the fill text."""
        from ipywidgets import interactive

        def _set_params(fill_with="missing"):
            self.fill_with = fill_with

        return interactive(_set_params)

    def transform(self, ser: pd.Series) -> pd.Series:
        """Fills missing values with a specified value.

        Values whose string form is empty or whitespace-only are treated as
        missing as well.

        Args:
            ser (pd.Series): The series to transform.

        Returns:
            pd.Series: The transformed series.
        """
        def _blank_to_nan(value):
            # Converts "" (after stripping) into np.nan so it gets filled
            if str(value).strip() == "":
                return np.nan
            return value

        cleaned = ser.apply(_blank_to_nan)
        return cleaned.fillna(self.fill_with)
class MapCategories(XBaseTransformer):
    """Maps all categories of a string column to new values.

    Args:
        category_values (dict): Mapping of existing category to new value.
    """

    # Informs the embedded GUI which data types this transformer supports.
    supported_types = ['categorical']

    def __init__(self, category_values={}):
        super().__init__()
        # Copy plain dicts so the shared default object is never stored on
        # (and therefore never shared between) instances. The exact-type
        # check is deliberate: other mapping objects are kept as passed so
        # any behaviour they add is not stripped by the copy.
        if type(category_values) is dict:
            category_values = dict(category_values)
        self.category_values = category_values

    def __call__(self, ser, *args, **kwargs):
        """Return the interactive widget used to edit the mapping.

        Args:
            ser (pd.Series): The column whose unique non-null values each
                get a text box, pre-filled with the value itself.
        """
        from ipywidgets import interactive
        import ipywidgets as widgets

        def _set_params(*args, **kwargs):
            # One keyword argument per category: name -> entered text.
            self.category_values = kwargs

        category_values = {i: widgets.Text(i) for i in ser.dropna().unique()}

        return interactive(_set_params, **category_values)

    def transform(self, ser: pd.Series) -> pd.Series:
        """Maps all categories of a string column to new values.

        Args:
            ser (pd.Series): The series to transform.

        Returns:
            pd.Series: The transformed series.
        """
        return ser.map(self.category_values)
class TextContains(XBaseTransformer):
    """Flags series values that contain, start with, or end with a value.

    Args:
        selector (str): The type of search to make: 'starts with',
            'ends with' or 'contains'. Any other value is treated as
            'contains'.
        value (str): The value to search.
    """

    # Informs the embedded GUI which data types this transformer supports.
    supported_types = ['categorical']

    def __init__(self, selector=None, value=None):
        # Initialise the base transformer, as the other transformers in
        # this module do.
        super().__init__()
        self.selector = selector
        self.value = value

    def __call__(self, *args, **kwargs):
        """Return the interactive widget used to configure the search."""
        from ipywidgets import interactive
        import ipywidgets as widgets

        def _set_params(
                selector=widgets.Dropdown(
                    options=["starts with", "ends with", "contains"]),
                value=''):
            self.selector = selector
            self.value = value

        return interactive(_set_params)

    def transform(self, ser: pd.Series) -> pd.Series:
        """Flags series values that contain, start with, or end with a value.

        Args:
            ser (pd.Series): The series to transform.

        Returns:
            pd.Series: The transformed series.
        """
        if self.selector == "starts with":
            return ser.str.startswith(self.value)

        if self.selector == "ends with":
            return ser.str.endswith(self.value)

        # Fallback for 'contains' and any unrecognised selector.
        return ser.str.contains(self.value)
class TextTrim(XBaseTransformer):
    """Drops or keeps first/last n characters of a categorical column.

    Args:
        selector (str): [first, last]. Any value other than 'first' is
            treated as 'last'.
        n (int): Number of characters to identify.
        action (str): [keep, drop] the identified characters. Any value
            other than 'keep' is treated as 'drop'.
    """

    # Informs the embedded GUI which data types this transformer supports.
    supported_types = ['categorical']

    def __init__(self, selector=None, n=0, action='keep'):
        # Initialise the base transformer, as the other transformers in
        # this module do.
        super().__init__()
        self.selector = selector
        self.n = n
        self.action = action

    def __call__(self, *args, **kwargs):
        """Return the interactive widget used to configure the trim."""
        from ipywidgets import interactive
        import ipywidgets as widgets

        def _set_params(
                selector=widgets.Dropdown(options=["first", "last"]),
                # The initial number is supplied through `value`.
                n=widgets.IntText(value=1),
                action=widgets.Dropdown(options=['keep', 'drop']),
        ):
            self.selector = selector
            self.n = n
            self.action = action

        return interactive(_set_params)

    def transform(self, ser: pd.Series) -> pd.Series:
        """Drops or keeps first/last n characters of a categorical column.

        Args:
            ser (pd.Series): The series to transform.

        Returns:
            pd.Series: The transformed series.
        """
        n = self.n
        keep = self.action == 'keep'

        if self.selector == "first":
            return ser.str[:n] if keep else ser.str[n:]

        if n == 0:
            # `-0` is just `0`, so the negative-index slices below would
            # return the opposite of what was asked: keeping the last zero
            # characters must give an empty string, dropping them must give
            # the full string.
            return ser.str[:0] if keep else ser.str[0:]

        return ser.str[-n:] if keep else ser.str[:-n]
class TextSlice(XBaseTransformer):
    """Selects slice from categorical column string.

    Args:
        start (int): Starting character. ``None`` means the beginning of
            the string.
        end (int): Ending character (exclusive). ``None`` means the end of
            the string.
        action (str): [keep, drop] selected slice. Any value other than
            'keep' is treated as 'drop'.
    """

    # Informs the embedded GUI which data types this transformer supports.
    supported_types = ['categorical']

    def __init__(self, start=None, end=None, action='keep'):
        # Initialise the base transformer, as the other transformers in
        # this module do.
        super().__init__()
        self.start = start
        self.end = end
        self.action = action

    def __call__(self, *args, **kwargs):
        """Return the interactive widget used to configure the slice."""
        from ipywidgets import interactive
        import ipywidgets as widgets

        def _set_params(
                # The initial number is supplied through `value`.
                start=widgets.IntText(value=0),
                end=widgets.IntText(value=0),
                action=widgets.Dropdown(options=['keep', 'drop'])
        ):
            self.start = start
            self.end = end
            self.action = action

        return interactive(_set_params)

    def transform(self, ser: pd.Series) -> pd.Series:
        """Selects slice from categorical column string.

        Args:
            ser (pd.Series): The series to transform.

        Returns:
            pd.Series: The transformed series.
        """
        if self.action == 'keep':
            return ser.str[self.start:self.end]

        # Drop the slice by replacing it with nothing. Unlike concatenating
        # `str[:start] + str[end:]`, this does not duplicate text when
        # `start`/`end` is None or when the slice is empty.
        return ser.str.slice_replace(self.start, self.end, '')
class ReplaceWith(XBaseTransformer):
    """Replaces specified value in series.

    Args:
        target (str): The text to search for within each value.
        replace_with (str): The text inserted in place of each match.
    """

    # Informs the embedded GUI which data types this transformer supports.
    supported_types = ['categorical']

    def __init__(self, target=None, replace_with=None):
        super().__init__()
        self.target = target
        self.replace_with = replace_with

    def __call__(self, *args, **kwargs):
        """Return the interactive widget used to enter both texts."""
        from ipywidgets import interactive

        def _set_params(target='', replace_with=''):
            self.target = target
            self.replace_with = replace_with

        return interactive(_set_params)

    def transform(self, ser: pd.Series) -> pd.Series:
        """Replaces specified value in series.

        Args:
            ser (pd.Series): The series to transform.

        Returns:
            pd.Series: The transformed series.
        """
        replaced = ser.str.replace(pat=self.target, repl=self.replace_with)
        return replaced