Reference

Biobank Dataset class.

`Dataset` ¶

Biobank Dataset class.

`__init__(self)` `special` ¶

Constructor.

Source code in biobank/dataset.py

def __init__(self):
    """Constructor.

    Resolves the configured settings path to an absolute directory, stores
    the dataset location (that directory joined with ``self.filename``) and
    creates the accompanying ``Dictionary`` for the same directory.
    """
    base_dir = settings.path.absolute()
    self.path = base_dir / self.filename
    self.dictionary = Dictionary(base_dir)

`delete(self)` ¶

Delete the dataset.

Returns:

Type	Description
`None`	None

Source code in biobank/dataset.py

def delete(self) -> None:
    """Remove the stored dataset from disk.

    When ``self.path`` is a regular file it is unlinked; otherwise the
    location is removed as a directory tree.

    Returns:
        None
    """
    target = self.path
    if not target.is_file():
        # Not a regular file: treat the location as a directory tree.
        shutil.rmtree(str(target))
        return
    target.unlink()

`import_dataset(self, path, dictionary)` ¶

Import a dataset.

Parameters:

Name	Type	Description	Default
`path`		URL or local path of dataset to import	required
`dictionary`		Dictionary source to load before the dataset is imported	required

Returns:

Type	Description
`None`	None

Source code in biobank/dataset.py

def import_dataset(self, path, dictionary) -> None:
    """Import a dataset.

    The data dictionary is loaded first (with ``download=True``), the
    dataset is then read through an ``ImportManager`` and the resulting
    data and schema are written out via ``self.save``.

    Args:
        path: URL or local path of dataset to import
        dictionary: Dictionary source handed to ``self.dictionary.load``
            (presumably a URL or local path -- confirm against
            ``Dictionary.load``)

    Returns:
        None
    """
    self.dictionary.load(dictionary, download=True)

    # The import and the save each run under their own progress bar.
    with ProgressBar():
        data, schema = ImportManager().import_dataset(self.dictionary, path)

    with ProgressBar():
        self.save(data, schema)

`load(self, **kwargs)` ¶

Loads a previously imported Biobank dataset.

Parameters:

Name	Type	Description	Default
`**kwargs`		Dictionary of keyword arguments	`{}`

Returns:

Type	Description
`DataFrame`	A Dask DataFrame object

Source code in biobank/dataset.py

def load(self, **kwargs) -> dd.DataFrame:
    """Loads a previously imported Biobank dataset.

    The dataset is read as Parquet from ``self.path``.

    Args:
        **kwargs: Extra keyword arguments forwarded unchanged to
            ``dd.read_parquet``

    Returns:
        A Dask DataFrame object
    """
    dataset_location = str(self.path)
    return dd.read_parquet(dataset_location, **kwargs)

`load_metadata(self)` ¶

Loads metadata for the Biobank dataset.

Returns:

Type	Description
`FileMetaData`	A FileMetadata object

Source code in biobank/dataset.py

def load_metadata(self) -> pq.FileMetaData:
    """Loads metadata for the Biobank dataset.

    The metadata is read from the ``_common_metadata`` file located
    directly under ``self.path``.

    Returns:
        A FileMetadata object
    """
    metadata_file = self.path / "_common_metadata"
    return pq.read_metadata(metadata_file)

`save(self, data, schema, **kwargs)` ¶

Saves a Biobank dataset.

Parameters:

Name	Description	Default
`data`	Biobank dataset as a Dask Dataframe	required
`schema`	Parquet schema	required
`**kwargs`	Dictionary of keyword arguments	`{}`

Returns:

Type	Description
`None`	None

Source code in biobank/dataset.py

def save(self, data, schema, **kwargs) -> None:
    """Saves a Biobank dataset.

    Writes ``data`` to ``self.path`` as Parquet. Snappy compression and the
    pyarrow engine are used unless overridden through ``kwargs``.

    Args:
        data: Biobank dataset as a Dask Dataframe
        schema: Parquet schema
        **kwargs: Extra keyword arguments forwarded to ``data.to_parquet``;
            they take precedence over the default ``compression`` and
            ``engine`` values

    Returns:
        None
    """
    print(f"saving dataset to {self.path}")
    # Defaults come first so caller-supplied kwargs can override them
    # without triggering a duplicate-keyword error.
    options = {"compression": "snappy", "engine": "pyarrow", **kwargs}
    data.to_parquet(self.path, schema=schema, **options)

`select(self, fields=None, limit=None)` ¶

Select specific fields from the Biobank dataset.

Parameters:

Name	Type	Description	Default
`fields`		List of fields to select	`None`
`limit`		Number of rows to select	`None`

Returns:

Type	Description
`DataFrame`	A Pandas DataFrame

Source code in biobank/dataset.py

def select(self, fields=None, limit=None) -> pd.DataFrame:
    """Select specific fields from the Biobank dataset.

    Requested fields are first resolved through ``self.match_fields``; if
    nothing matches, an empty DataFrame is returned without loading the
    dataset. Missing values in float, datetime and object columns are
    replaced with empty strings before the result is computed.

    Args:
        fields: List of fields to select; a falsy value loads the dataset
            without a column restriction
        limit: Number of rows to select; a falsy value keeps all rows

    Returns:
        A Pandas DataFrame
    """
    columns = None
    if fields:
        columns = self.match_fields(fields)
        if len(columns) == 0:
            # Nothing matched: skip loading the dataset entirely.
            return pd.DataFrame()

    with ProgressBar():
        frame = self.load(columns=columns, use_threads=True)
        if limit:
            # Keep only rows whose index value is among the first `limit`.
            leading_index = frame.index.head(limit)
            frame = frame.loc[frame.index.isin(leading_index)]

        blankable_columns = frame.select_dtypes(
            [np.float64, np.datetime64, object]
        ).columns
        nan_to_blank = {col: {np.nan: ""} for col in blankable_columns}
        frame = frame.replace(to_replace=nan_to_blank)
        result = frame.compute()

    return result

Dictionary class for managing data dictionary.

`Dictionary` ¶

Dictionary class for managing data dictionary.

`__init__(self, path)` `special` ¶

Constructor.

Parameters:

Name	Type	Description	Default
`path`	`Path`	Directory to store the dictionary	required

Source code in biobank/dictionary.py

def __init__(self, path: Path):
    """Constructor.

    Starts with no fields cached (``self._fields`` is ``None``) and records
    where the dictionary file lives: ``self.filename`` inside ``path``.

    Args:
        path: Directory to store the dictionary
    """
    self._fields = None
    dictionary_file = path / self.filename
    self.path = dictionary_file

`download(self, path=None)` ¶

Download dictionary from URL.

Returns:

Type	Description
`None`	None

Source code in biobank/dictionary.py

def download(self, path=None) -> None:
    """Download dictionary from URL.

    Reads the dictionary table from the source location and stores it as
    Parquet at ``self.path``, creating the parent directory if needed.

    Args:
        path: Location to read the dictionary from; when omitted or falsy,
            ``settings.dictionary.url`` is used

    Returns:
        None
    """
    source = path or settings.dictionary.url

    self.path.parent.mkdir(parents=True, exist_ok=True)
    with ProgressBar():
        print(f"loading dictionary from {source}")
        table = dd.read_table(source).compute()
        table.to_parquet(self.path)

`filter(self, fields, search)` ¶

Filter dictionary.

Parameters:

Name	Type	Description	Default
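`fields`	`List[str]`	Field names whose field IDs select the dictionary entries to keep	required
`search`	`str`	Case-insensitive pattern matched against the `Field` column; an empty value skips this step	required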

Returns:

Type	Description
`DataFrame`	The filtered dictionary entries

Source code in biobank/dictionary.py

def filter(self, fields: List[str], search: str) -> pd.DataFrame:
    """Filter dictionary.

    Keeps the dictionary rows whose ``FieldID`` matches the field ID of any
    of the given fields and, when ``search`` is non-empty, further narrows
    them to rows whose ``Field`` value contains ``search`` (ignoring case).

    Args:
        fields: Field names; each is reduced to its field ID via
            ``self.get_field_id``
        search: Pattern to look for in the ``Field`` column; a falsy value
            skips this step

    Returns:
        The filtered dictionary as a DataFrame
    """
    table = self.load()
    wanted_ids = {self.get_field_id(name) for name in fields}
    subset = table[table.FieldID.isin(wanted_ids)]
    if not search:
        return subset

    name_matches = subset.Field.str.contains(search, case=False)
    return subset[name_matches]

`get_field_id(self, field)` ¶

Get field ID.

Parameters:

Name	Type	Description	Default
`field`	`str`	Field name	required

Returns:

Type	Description
`str`	Field ID as str

Source code in biobank/dictionary.py

def get_field_id(self, field: str) -> str:
    """Get field ID.

    The field ID is the part of the field name that precedes the first
    hyphen; a name without a hyphen is returned unchanged.

    Args:
        field: Field name

    Returns:
        Field ID as str
    """
    field_id, _, _ = field.partition("-")
    return field_id

`get_pandas_dtype(self, field)` ¶

Get Pandas type for a field.

Parameters:

Name	Type	Description	Default
`field`		Name of field	required

Returns:

Type	Description
`Any`	Pandas dtype

Source code in biobank/dictionary.py

def get_pandas_dtype(self, field) -> Any:
    """Get Pandas type for a field.

    The field's Arrow type is looked up through ``self.get_type`` and
    converted to its Pandas counterpart.

    Args:
        field: Name of field

    Returns:
        Pandas dtype, or None when ``self.get_type`` yields no (truthy)
        type for the field
    """
    arrow_type = self.get_type(field)
    if arrow_type:
        if arrow_type == pa.int64():
            # Use the pandas nullable integer type for 64-bit integers.
            return "Int64"
        return arrow_type.to_pandas_dtype()

    return None

`get_type(self, field)` ¶

Get Arrow data type for a field.

Parameters:

Name	Type	Description	Default
`field`		Name of field	required

Returns:

Type	Description
`DataType`	Arrow field type

Source code in biobank/dictionary.py

def get_type(self, field) -> pa.DataType:
    """Get Arrow data type for a field.

    The field name is reduced to its field ID, which is then looked up in
    the index of ``self.fields``.

    Args:
        field: Name of field

    Returns:
        Arrow field type (the ``Type`` value stored for the field ID), or
        None when the field ID is not in the index of ``self.fields``
    """
    field_id = self.get_field_id(field)
    if field_id not in self.fields.index:
        return None
    return self.fields.loc[field_id].Type

Reference

Dataset ¶

__init__(self) special ¶

delete(self) ¶

import_dataset(self, path, dictionary) ¶

load(self, **kwargs) ¶

load_metadata(self) ¶

save(self, data, schema, **kwargs) ¶

select(self, fields=None, limit=None) ¶

Dictionary ¶

__init__(self, path) special ¶

download(self, path=None) ¶

filter(self, fields, search) ¶

get_field_id(self, field) ¶

get_pandas_dtype(self, field) ¶

get_type(self, field) ¶

`Dataset` ¶

`__init__(self)` `special` ¶

`delete(self)` ¶

`import_dataset(self, path, dictionary)` ¶

`load(self, **kwargs)` ¶

`load_metadata(self)` ¶

`save(self, data, schema, **kwargs)` ¶

`select(self, fields=None, limit=None)` ¶

`Dictionary` ¶

`__init__(self, path)` `special` ¶

`download(self, path=None)` ¶

`filter(self, fields, search)` ¶

`get_field_id(self, field)` ¶

`get_pandas_dtype(self, field)` ¶

`get_type(self, field)` ¶