# Source code for pasteur.extras.datasets.adult

from __future__ import annotations

from typing import TYPE_CHECKING

from ....dataset import TabularDataset
from ....utils import get_relative_fn, RawSource

import logging

if TYPE_CHECKING:
    import pandas as pd

logger = logging.getLogger(__name__)


class AdultDataset(TabularDataset):
    """Dataset definition for the UCI "adult" archive.

    The raw data is a single ``adult.zip`` archive fetched from the URL in
    ``raw_sources``.  ``bootstrap`` unpacks it and ``_process_chunk`` merges
    the ``train`` and ``test`` tables into one frame.
    """

    name = "adult"
    # Presumably: the single "table" output is built from the "train" and
    # "test" raw tables (semantics are defined by TabularDataset -- confirm).
    deps = {"table": ["train", "test"]}
    folder_name = "adult"
    catalog = get_relative_fn("catalog.yml")
    raw_sources = RawSource(["https://archive.ics.uci.edu/static/public/2/adult.zip"])

    def bootstrap(self, raw: str, dst: str):
        """Extract ``adult.zip`` found in ``raw`` into ``dst``.

        Args:
            raw: Directory that contains the downloaded ``adult.zip``.
            dst: Directory to extract into; created if it does not exist.
        """
        import os
        from zipfile import ZipFile

        os.makedirs(dst, exist_ok=True)
        with ZipFile(os.path.join(raw, "adult.zip"), "r") as zf:
            # Plain literal: the message has no placeholders to interpolate.
            logger.info("Extracting adult.zip...")
            zf.extractall(dst)

    def _process_chunk(self, tables: dict[str, pd.DataFrame]):
        """Combine the ``train`` and ``test`` tables into a single frame.

        Args:
            tables: Mapping that holds at least the ``"train"`` and ``"test"``
                frames.  The test frame's ``income`` column must be
                categorical, since it is accessed through ``.cat``.

        Returns:
            The train rows followed by the test rows, with the
            ``native-country`` column cast to ``category``.
        """
        import pandas as pd

        train = tables["train"]

        # The test split spells its income labels with a trailing period
        # ("<=50K." / ">50K."); rename them to the spelling without it.
        test_income = tables["test"]["income"].cat.rename_categories(
            {"<=50K.": "<=50K", ">50K.": ">50K"}
        )
        test = tables["test"].assign(income=test_income)

        combined = pd.concat([train, test])
        # NOTE(review): presumably the two splits carry different category
        # sets for this column, so concat drops the categorical dtype and it
        # has to be restored here -- confirm against the catalog.
        return combined.astype({"native-country": "category"})