Skip to content

Reference

Biobank Dataset class.

Dataset

Biobank Dataset class.

__init__(self) special

Constructor.

Source code in biobank/dataset.py
def __init__(self):
    """Constructor.

    Resolves the configured storage directory to an absolute path, points
    ``self.path`` at this dataset's file inside that directory and creates
    the accompanying data dictionary for the same directory.
    """
    base_dir = settings.path.absolute()
    self.path = base_dir / self.filename
    self.dictionary = Dictionary(base_dir)

delete(self)

Delete the dataset.

Returns:

Type Description
None

None

Source code in biobank/dataset.py
def delete(self) -> None:
    """Delete the dataset.

    A dataset stored as a single file is unlinked; anything else is
    treated as a directory tree and removed recursively.

    Returns:
        None
    """
    target = self.path
    if not target.is_file():
        # Not a plain file: remove it as a directory tree.
        shutil.rmtree(str(target))
        return
    target.unlink()

import_dataset(self, path, dictionary)

Import a dataset.

Parameters:

Name Type Description Default
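path

URL or local path of dataset to import

required
dictionary

Argument forwarded to the data dictionary's load call (with download enabled) before the import starts

required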

Returns:

Type Description
None

None

Source code in biobank/dataset.py
def import_dataset(self, path, dictionary) -> None:
    """Import a dataset.

    Args:
        path: URL or local path of dataset to import
        dictionary: Forwarded to the data dictionary's ``load`` call
            (with ``download=True``) before the import starts

    Returns:
        None
    """
    self.dictionary.load(dictionary, download=True)

    # The import and the save each run under their own progress bar.
    with ProgressBar():
        data, schema = ImportManager().import_dataset(self.dictionary, path)

    with ProgressBar():
        self.save(data, schema)

load(self, **kwargs)

Loads a previously imported Biobank dataset.

Parameters:

Name Type Description Default
**kwargs

Dictionary of keyword arguments

{}

Returns:

Type Description
DataFrame

A Dask DataFrame object

Source code in biobank/dataset.py
def load(self, **kwargs) -> dd.DataFrame:
    """Loads a previously imported Biobank dataset.

    Args:
        **kwargs: Keyword arguments passed straight through to
            ``dd.read_parquet``

    Returns:
        A Dask DataFrame object
    """
    # read_parquet is given the dataset location as a plain string.
    parquet_location = str(self.path)
    return dd.read_parquet(parquet_location, **kwargs)

load_metadata(self)

Loads metadata for the Biobank dataset.

Returns:

Type Description
FileMetaData

A FileMetaData object

Source code in biobank/dataset.py
def load_metadata(self) -> pq.FileMetaData:
    """Loads metadata for the Biobank dataset.

    The metadata is read from the ``_common_metadata`` file inside the
    dataset directory.

    Returns:
        A FileMetaData object
    """
    metadata_file = self.path / "_common_metadata"
    return pq.read_metadata(metadata_file)

save(self, data, schema, **kwargs)

Saves a Biobank dataset.

Parameters:

Name Type Description Default
data

Biobank dataset as a Dask DataFrame

required
schema

Parquet schema

required
**kwargs

Dictionary of keyword arguments

{}

Returns:

Type Description
None

None

Source code in biobank/dataset.py
def save(self, data, schema, **kwargs) -> None:
    """Saves a Biobank dataset.

    Args:
        data: Biobank dataset as a Dask DataFrame
        schema: Parquet schema
        **kwargs: Extra keyword arguments forwarded to ``data.to_parquet``;
            they may override the default ``compression`` ("snappy") and
            ``engine`` ("pyarrow")

    Returns:
        None
    """
    print(f"saving dataset to {self.path}")
    # Defaults go first so caller-supplied keyword arguments take precedence
    # instead of being silently dropped.
    options = {"compression": "snappy", "engine": "pyarrow", **kwargs}
    data.to_parquet(self.path, schema=schema, **options)

select(self, fields=None, limit=None)

Select specific fields from the Biobank dataset.

Parameters:

Name Type Description Default
fields

List of fields to select

None
limit

Number of rows to select

None

Returns:

Type Description
DataFrame

A Pandas DataFrame

Source code in biobank/dataset.py
def select(self, fields=None, limit=None) -> pd.DataFrame:
    """Select specific fields from the Biobank dataset.

    Args:
        fields: List of fields to select; a falsy value loads every column
        limit: Number of rows to select; a falsy value applies no limit

    Returns:
        A Pandas DataFrame (empty when ``fields`` is given but none of
        them match)
    """
    columns = None
    if fields:
        columns = self.match_fields(fields)
        if not len(columns):
            return pd.DataFrame()

    with ProgressBar():
        frame = self.load(columns=columns, use_threads=True)
        if limit:
            # Keep only rows whose index value is among the first `limit`
            # index entries.
            leading_index = frame.index.head(limit)
            frame = frame.loc[frame.index.isin(leading_index)]

        # Missing values in float, datetime and object columns are
        # replaced with empty strings.
        blank_columns = frame.select_dtypes(
            [np.float64, np.datetime64, object]
        ).columns
        frame = frame.replace(
            to_replace={name: {np.nan: ""} for name in blank_columns}
        )
        result = frame.compute()

    return result

Dictionary class for managing data dictionary.

Dictionary

Dictionary class for managing data dictionary.

__init__(self, path) special

Constructor.

Parameters:

Name Type Description Default
path Path

Directory to store the dictionary

required
Source code in biobank/dictionary.py
def __init__(self, path: Path):
    """Constructor.

    Args:
        path: Directory to store the dictionary; the dictionary file
            itself lives at ``path / self.filename``
    """
    self.path = path / self.filename
    # No fields are held yet at construction time.
    self._fields = None

download(self, path=None)

Download dictionary from URL.

Returns:

Type Description
None

None

Source code in biobank/dictionary.py
def download(self, path=None) -> None:
    """Download dictionary from URL.

    Args:
        path: Location to read the dictionary table from; a falsy value
            falls back to ``settings.dictionary.url``

    Returns:
        None
    """
    path = path or settings.dictionary.url

    # Make sure the directory that will hold the parquet copy exists.
    self.path.parent.mkdir(parents=True, exist_ok=True)
    with ProgressBar():
        print(f"loading dictionary from {path}")
        # Read the table, materialise it, then store it as parquet.
        dd.read_table(path).compute().to_parquet(self.path)

filter(self, fields, search)

Filter dictionary.

Parameters:

Name Type Description Default
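fields List[str]

Field names; only dictionary rows whose FieldID matches one of their field IDs are kept

required
search str

Case-insensitive pattern matched against the Field column; an empty value skips the search

required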

Returns:

Type Description
DataFrame

The dictionary rows matching the given fields and search term

Source code in biobank/dictionary.py
def filter(self, fields: List[str], search: str) -> pd.DataFrame:
    """Filter dictionary.

    Args:
        fields: Field names; each is reduced to its field ID and only
            dictionary rows with one of those IDs are kept
        search: Case-insensitive pattern matched against the ``Field``
            column; a falsy value skips the search step

    Returns:
        The matching rows of the dictionary
    """
    dictionary = self.load()
    field_ids = {self.get_field_id(field) for field in fields}
    dictionary = dictionary[dictionary.FieldID.isin(field_ids)]
    if search:
        # na=False: rows whose Field is missing count as non-matching
        # instead of putting NaN into the boolean mask, which pandas
        # refuses to index with.
        matches = dictionary.Field.str.contains(search, case=False, na=False)
        dictionary = dictionary[matches]
    return dictionary

get_field_id(self, field)

Get field ID.

Parameters:

Name Type Description Default
field str

Field name

required

Returns:

Type Description
str

Field ID as str

Source code in biobank/dictionary.py
def get_field_id(self, field: str) -> str:
    """Get field ID.

    Args:
        field: Field name

    Returns:
        Field ID as str: everything before the first "-" in the name,
        or the whole name when it contains no "-"
    """
    field_id, _, _ = field.partition("-")
    return field_id

get_pandas_dtype(self, field)

Get Pandas type for a field.

Parameters:

Name Type Description Default
field

Name of field

required

Returns:

Type Description
Any

Pandas dtype

Source code in biobank/dictionary.py
def get_pandas_dtype(self, field) -> Any:
    """Get Pandas type for a field.

    Args:
        field: Name of field

    Returns:
        Pandas dtype, or None when no Arrow type is found for the field
    """
    arrow_type = self.get_type(field)
    if arrow_type:
        if arrow_type == pa.int64():
            # use pandas nullable integer type
            return "Int64"
        return arrow_type.to_pandas_dtype()
    return None

get_type(self, field)

Get Arrow data type for a field.

Parameters:

Name Type Description Default
field

Name of field

required

Returns:

Type Description
DataType

Arrow field type

Source code in biobank/dictionary.py
def get_type(self, field) -> pa.DataType:
    """Get Arrow data type for a field.

    Args:
        field: Name of field

    Returns:
        Arrow field type
    """
    field = self.get_field_id(field)
    if field in self.fields.index:
        return self.fields.loc[field].Type