Source code for pasteur.extras.datasets.pad
from __future__ import annotations
from typing import TYPE_CHECKING, Callable, cast
import numpy as np
import pandas as pd
from ....dataset import Dataset
from ....utils import (
LazyChunk,
LazyFrame,
gen_closure,
get_relative_fn,
to_chunked,
)
import logging
logger = logging.getLogger(__name__)
[docs]
class PadDataset(Dataset):
def __init__(self, n_partitions: int = 5, **_) -> None:
super().__init__(**_)
self._n_partitions = n_partitions
name = "pad"
deps = {}
key_deps = ["pad1"]
folder_name = "pad"
catalog = get_relative_fn("catalog.yml")
[docs]
def bootstrap(self, location: str, bootstrap: str):
import os
os.makedirs(bootstrap, exist_ok=True)
logger.info("Using pyreadr to extract dataframes.")
import pyreadr
dfs = pyreadr.read_r(os.path.join(location, "pad.rda"))
for k, v in dfs.items():
out_fn = os.path.join(bootstrap, f"{k}.csv")
logger.info(f"Writing '{k}' to '{out_fn}'")
v.to_csv(out_fn, index=False)
[docs]
def ingest(self, name, **tables: LazyFrame | Callable[[],]):
pass
[docs]
@to_chunked
def keys(self, **tables: LazyChunk):
return tables["pad1"]()[[]]