Source code for xplainable.preprocessing.pipeline
""" Copyright Xplainable Pty Ltd, 2023"""
from ..utils.exceptions import TransformerError
import pandas as pd
import numpy as np
class XPipeline:
    """Pipeline builder for xplainable transformers.

    A pipeline is an ordered list of stages. Each stage is a dict holding:

    * ``transformer``: an object exposing ``transform`` (and ``fit`` /
      ``inverse_transform`` for the methods that call them),
    * ``feature``: the column the transformer is applied to, or
      ``'__dataset__'`` when the transformer receives the whole DataFrame,
    * ``name``: the transformer's class name (filled in by ``add_stages``).

    Attributes:
        stages (list): the pipeline stages, in execution order.
    """

    # Feature name marking a stage that operates on the whole DataFrame.
    _DATASET = '__dataset__'

    def __init__(self):
        self.stages = []

    def add_stages(self, stages: list) -> 'XPipeline':
        """ Adds multiple stages to the pipeline.

        The stage dicts are updated in place: a missing or ``None``
        ``feature`` becomes ``'__dataset__'`` and ``name`` is set to the
        transformer's class name.

        Args:
            stages (list): list containing xplainable pipeline stages.

        Returns:
            XPipeline: self

        Raises:
            ValueError: if ``stages`` is empty.
            TypeError: if a stage's transformer has no ``transform`` attribute.
        """
        if len(stages) == 0:
            raise ValueError('You must include at least one pipeline stage.')

        # Validate every stage before touching the pipeline so that a bad
        # stage cannot leave it partially updated.
        for stage in stages:
            transformer = stage['transformer']
            if not hasattr(transformer, 'transform'):
                # Report the offending transformer's type, not the type of
                # the stage container (which is always a dict).
                raise TypeError(f'{type(transformer)} type is not supported.')

        for stage in stages:
            # A missing/None feature marks a dataset-level transformer.
            if stage.get('feature', None) is None:
                stage['feature'] = self._DATASET
            stage['name'] = stage['transformer'].__class__.__name__
            self.stages.append(stage)

        return self

    def drop_stage(self, stage: int) -> 'XPipeline':
        """ Drops a stage from the pipeline.

        Args:
            stage (int): index of the stage to drop. Negative indices count
                from the end, as with ``list.pop``.

        Returns:
            XPipeline: self

        Raises:
            ValueError: if the pipeline has no stages.
            IndexError: if ``stage`` is outside the valid index range.
        """
        n_stages = len(self.stages)
        if n_stages == 0:
            raise ValueError("There are no stages for in the pipeline.")

        # Check both ends so that negative out-of-range indices get the same
        # explicit error as positive ones.
        if not -n_stages <= stage < n_stages:
            raise IndexError(f"Index {stage} out of bounds")

        self.stages.pop(stage)
        return self

    def fit(self, x: pd.DataFrame) -> 'XPipeline':
        """ Sequentially iterates through pipeline stages and fits data.

        Each stage is fitted on the data as transformed by all stages before
        it, which is the same data ``transform`` later passes to that stage.
        Dataset-level stages are fitted and applied too, so columns they
        create or modify are visible to the stages that follow.

        Args:
            x (pd.DataFrame): A non-empty DataFrame to fit.

        Returns:
            XPipeline: The fitted pipeline.
        """
        x = x.copy()  # never modify the caller's frame
        last = len(self.stages) - 1

        for i, stage in enumerate(self.stages):
            transformer = stage['transformer']
            feature = stage['feature']

            if feature == self._DATASET:
                transformer.fit(x)
                # The output of the final stage is not needed for fitting.
                if i < last:
                    x = transformer.transform(x)
                continue

            transformer.fit(x[feature])
            if i < last:
                # Apply the transformation so later stages are chained.
                x[feature] = transformer.transform(x[feature])

        return self

    def transform(self, x: pd.DataFrame) -> pd.DataFrame:
        """ Iterates through pipeline stages applying transformations.

        Feature-level stages whose feature is not a column of the current
        frame are skipped.

        Args:
            x (pd.DataFrame): A non-empty DataFrame to transform.

        Returns:
            pd.DataFrame: The transformed dataframe.

        Raises:
            TransformerError: if a stage fails; the original exception is
                attached as the cause.
        """
        x = x.copy()

        for stage in self.stages:
            transformer = stage['transformer']
            feature = stage['feature']
            try:
                if feature == self._DATASET:
                    x = transformer.transform(x)
                    continue

                if feature not in x.columns:
                    continue

                x[feature] = transformer.transform(x[feature])

            except Exception as err:
                tf_name = transformer.__class__.__name__
                # Chain the cause: the hint in the message is only a guess.
                raise TransformerError(
                    f"Transformer {tf_name} for feature {feature} failed. Ensure the datatypes are compatible") from err

        return x

    def fit_transform(self, x: pd.DataFrame, start: int = 0) -> pd.DataFrame:
        """ Runs the fit method followed by the transform method.

        Each stage from ``start`` onwards is fitted on the current data and
        then applied to it before moving on to the next stage. Stages before
        ``start`` are neither fitted nor applied.

        Args:
            x (pd.DataFrame): A non-empty DataFrame to fit.
            start (int): index of the stage to start fitting from.

        Returns:
            pd.DataFrame: The transformed dataframe.

        Raises:
            TransformerError: if a stage fails; the original exception is
                attached as the cause.
        """
        x = x.copy()

        for stage in self.stages[start:]:
            transformer = stage['transformer']
            feature = stage['feature']
            try:
                if feature == self._DATASET:
                    transformer.fit(x)
                    x = transformer.transform(x)
                    continue

                # Fit data to transformer
                transformer.fit(x[feature])

                # Apply transformation for chaining
                x[feature] = transformer.transform(x[feature])

            except Exception as err:
                tf_name = transformer.__class__.__name__
                raise TransformerError(
                    f"Transformer {tf_name} for {feature} failed. Ensure the datatypes are compatible") from err

        return x

    def transform_generator(self, x):
        """ Applies the pipeline stage by stage, yielding after each stage.

        Every stage's result is carried forward to the next one, so the last
        frame yielded matches the result of applying all stages in order.

        Args:
            x (pd.DataFrame): A non-empty DataFrame to transform.

        Yields:
            pd.DataFrame: The data after each successive stage.

        Raises:
            TransformerError: if a stage fails; the original exception is
                attached as the cause.
        """
        x = x.copy()

        for stage in self.stages:
            transformer = stage['transformer']
            feature = stage['feature']
            try:
                # If the stage is for the entire dataset, transform and yield
                if feature == self._DATASET:
                    x = transformer.transform(x)
                    yield x
                    continue

                # Work on a fresh copy so the frame yielded for the previous
                # stage is not modified by this column assignment.
                x_transformed = x.copy()
                x_transformed[feature] = transformer.transform(x[feature])

                # Carry the result forward so later stages are chained.
                x = x_transformed
                yield x

            except Exception as err:
                tf_name = transformer.__class__.__name__
                raise TransformerError(
                    f"Transformer {tf_name} for {feature} failed. Ensure the datatypes are compatible") from err

    def inverse_transform(self, x: pd.DataFrame) -> pd.DataFrame:
        """ Iterates through pipeline stages applying inverse transformations.

        Stages are undone in reverse order (last applied, first undone),
        which is required to invert chained stages on the same feature and
        dataset-level stages. Feature-level stages whose feature is not a
        column of the current frame are skipped.

        Args:
            x (pd.DataFrame): A non-empty DataFrame to inverse transform.

        Returns:
            pd.DataFrame: The inverse transformed dataframe.

        Raises:
            TransformerError: if a stage fails; the original exception is
                attached as the cause.
        """
        x = x.copy()

        for stage in reversed(self.stages):
            transformer = stage['transformer']
            feature = stage['feature']
            try:
                if feature == self._DATASET:
                    x = transformer.inverse_transform(x)
                    continue

                if feature not in x.columns:
                    continue

                x[feature] = transformer.inverse_transform(x[feature])

            except Exception as err:
                raise TransformerError(
                    f"Transformer for feature {feature} failed. Ensure the datatypes are compatible") from err

        return x

    def get_blueprint(self) -> list:
        """ Returns a blueprint of the pipeline.

        Returns:
            list: One dict per stage with the keys ``feature``,
                ``transformer`` (the transformer's class name) and ``args``
                (a shallow copy of the transformer's instance attributes).
        """
        blueprint = []

        for stage in self.stages:
            transformer = stage['transformer']
            blueprint.append({
                "feature": stage['feature'],
                "transformer": transformer.__class__.__name__,
                # Copy so that editing the blueprint cannot alter the
                # transformer's own attribute dict.
                "args": dict(transformer.__dict__),
            })

        return blueprint