# Source code for pasteur.transform
""" Contains the definition for Transformer and ReferenceTransformer modules. """
import logging
import pandas as pd
from .module import ModuleClass, ModuleFactory
from .attribute import Attribute, Attributes, SeqValue
logger = logging.getLogger(__name__)
# [docs] -- Sphinx viewcode link residue, not part of the module.
class Transformer(ModuleClass):
    """Base class that the other transformers in this module derive from.

    It declares three capability flags and the ``fit_transform`` /
    ``transform`` entry points. ``fit_transform`` relies on ``self.fit``,
    which is not declared in this class body; it has to come from a parent
    class or from the concrete subclass.
    """

    # Factory class associated with this module type.
    _factory = TransformerFactory

    deterministic = True
    "For a given output, the input is the same."

    lossless = True
    "The decoded output equals the input."

    stateful = False
    "Transformer fits variables."

    def __init__(self, **_) -> None:
        """Accept arbitrary keyword arguments and ignore all of them."""

    def fit_transform(self, data: pd.Series | pd.DataFrame) -> pd.DataFrame:
        """Fit on ``data`` and then return the transformed ``data``.

        Equivalent to calling ``fit(data)`` followed by ``transform(data)``.
        """
        self.fit(data)
        encoded = self.transform(data)
        return encoded

    def transform(self, data: pd.Series | pd.DataFrame) -> pd.DataFrame:
        """Encode ``data``.

        Raises:
            NotImplementedError: always; subclasses provide the encoding.
        """
        raise NotImplementedError
# [docs] -- Sphinx viewcode link residue, not part of the module.
class RefTransformer(Transformer):
    """Transformer that builds its embedding relative to a reference column.

    Using a reference makes it possible to bake constraints (and domain
    knowledge) into the embedding, so that every embedding value maps to a
    valid solution and learning becomes easier.

    Example: an end date embedding that references a start date. The
    embedding then forms a stable histogram with much less entropy, based on
    the period length. If the embedding is additionally forced to be
    positive, any value it takes yields a valid solution.
    """

    def fit(
        self,
        data: pd.Series | pd.DataFrame,
        ref: pd.Series | pd.DataFrame | None = None,
    ):
        """Fit the transformer on ``data`` given the optional reference ``ref``.

        This implementation does nothing and returns ``None``.
        """

    def fit_transform(
        self,
        data: pd.Series | pd.DataFrame,
        ref: pd.Series | pd.DataFrame | None = None,
    ) -> pd.DataFrame:
        """Fit on ``(data, ref)`` and return the transformed ``data``.

        Equivalent to ``fit(data, ref)`` followed by ``transform(data, ref)``.
        """
        self.fit(data, ref)
        encoded = self.transform(data, ref)
        return encoded

    def transform(
        self,
        data: pd.Series | pd.DataFrame,
        ref: pd.Series | pd.DataFrame | None = None,
    ) -> pd.DataFrame:
        """Encode ``data`` using the optional reference ``ref``.

        Raises:
            NotImplementedError: always; subclasses provide the encoding.
        """
        raise NotImplementedError

    def reverse(
        self, data: pd.DataFrame, ref: pd.Series | pd.DataFrame | None = None
    ) -> pd.DataFrame:
        """Decode ``data`` using the optional reference ``ref``.

        During reversal ``data`` holds encoded values while ``ref`` holds
        decoded/original values, so the referred columns have to be decoded
        before this method is called.

        Raises:
            NotImplementedError: always; subclasses provide the decoding.
        """
        raise NotImplementedError
# [docs] -- Sphinx viewcode link residue, not part of the module.
class SeqTransformer(Transformer):
    """Generalisation of Reference Transformers aimed at event data.

    A Sequence Transformer is handed the unprocessed parent columns, the
    references and the ID table; processing them and returning the encoded
    version is entirely up to the transformer. It may also push columns
    upstream to parents through context tables.

    Event-based data is sequential, so a Sequence Transformer may need the
    order of each row. To support that, the main Sequence Transformer (the
    "sequencer") is processed first and returns an additional data column
    and attribute during fitting. That column and attribute are then fed to
    the other sequence transformers.
    """

    def fit(
        self,
        table: str,
        data: pd.Series | pd.DataFrame,
        ref: dict[str, pd.DataFrame] | None = None,
        ids: pd.DataFrame | None = None,
        seq_val: SeqValue | None = None,
        seq: pd.Series | None = None,
    ) -> tuple[SeqValue, pd.Series] | None:
        """Fit the transformer for ``table`` on ``data``.

        ``ref`` maps names to reference frames, ``ids`` is the ID table, and
        ``seq_val`` / ``seq`` are the sequence attribute and column.

        This implementation does nothing and returns ``None``; the return
        annotation also allows a ``(SeqValue, Series)`` pair.
        """

    def get_attributes(self) -> tuple[Attributes, dict[str, Attributes]]:
        """Return the transformer's attributes.

        The annotated result is a pair: an ``Attributes`` object and a
        mapping from names to ``Attributes``.

        Raises:
            NotImplementedError: always; subclasses provide the attributes.
        """
        raise NotImplementedError

    def fit_transform(
        self,
        table: str,
        data: pd.Series | pd.DataFrame,
        ref: dict[str, pd.DataFrame] | None = None,
        ids: pd.DataFrame | None = None,
        seq_val: SeqValue | None = None,
        seq: pd.Series | None = None,
    ) -> tuple[pd.DataFrame, dict[str, pd.DataFrame]] | tuple[
        pd.DataFrame, dict[str, pd.DataFrame], pd.Series
    ]:
        """Fit on the inputs, then return the result of ``transform``.

        The value returned by ``fit`` is discarded; ``transform`` receives
        ``data``, ``ref``, ``ids`` and the ``seq`` passed in by the caller.
        """
        self.fit(table, data, ref, ids, seq_val, seq)
        result = self.transform(data, ref, ids, seq)
        return result

    def transform(
        self,
        data: pd.Series | pd.DataFrame,
        ref: dict[str, pd.DataFrame] | None = None,
        ids: pd.DataFrame | None = None,
        seq: pd.Series | None = None,
    ) -> tuple[pd.DataFrame, dict[str, pd.DataFrame]] | tuple[
        pd.DataFrame, dict[str, pd.DataFrame], pd.Series
    ]:
        """Encode ``data``.

        The annotated result is a tuple of the encoded frame and a mapping
        of names to frames, optionally followed by a series.

        Raises:
            NotImplementedError: always; subclasses provide the encoding.
        """
        raise NotImplementedError

    def reverse(
        self,
        data: pd.DataFrame,
        ctx: dict[str, pd.DataFrame],
        ref: dict[str, pd.DataFrame] | None = None,
        ids: pd.DataFrame | None = None,
    ) -> pd.DataFrame:
        """Decode ``data`` given the context tables ``ctx``.

        During reversal ``data`` holds encoded values while ``ref`` holds
        decoded/original values, so the referred columns have to be decoded
        before this method is called.

        Raises:
            NotImplementedError: always; subclasses provide the decoding.
        """
        raise NotImplementedError