Reference
Biobank Dataset class.
Dataset
¶
Biobank Dataset class.
__init__(self)
special
¶
Constructor.
Source code in biobank/dataset.py
def __init__(self):
    """Constructor.

    Resolves the storage directory from ``settings.path`` as an absolute
    path, points ``self.path`` at ``self.filename`` inside that directory
    and creates the ``Dictionary`` for the same directory.
    """
    base_dir = settings.path.absolute()
    self.path = base_dir / self.filename
    self.dictionary = Dictionary(base_dir)
delete(self)
¶
Delete the dataset.
Returns:
Type | Description |
---|---|
None |
None |
Source code in biobank/dataset.py
def delete(self) -> None:
    """Delete the dataset.

    A regular file or a symbolic link is unlinked; anything else is
    removed recursively as a directory tree.

    Returns:
        None

    Raises:
        FileNotFoundError: If ``self.path`` does not exist.
    """
    # shutil.rmtree refuses to operate on symbolic links, so a dataset path
    # that is a link (even one pointing at a directory) must be unlinked;
    # only the link is removed, never its target.
    if self.path.is_symlink() or self.path.is_file():
        self.path.unlink()
    else:
        shutil.rmtree(self.path)
import_dataset(self, path, dictionary)
¶
Import a dataset.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
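path | | URL or local path of dataset to import | required |
dictionary | | Dictionary source passed on to `Dictionary.load` (with `download=True`) | required |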
Returns:
Type | Description |
---|---|
None |
None |
Source code in biobank/dataset.py
def import_dataset(self, path, dictionary) -> None:
    """Import a dataset.

    Loads the data dictionary first, then reads the dataset through an
    ``ImportManager`` and finally stores the result with ``save``.

    Args:
        path: URL or local path of dataset to import
        dictionary: Passed as the first argument to ``self.dictionary.load``
            together with ``download=True``
            (presumably the dictionary location; confirm against
            ``Dictionary.load``)

    Returns:
        None
    """
    self.dictionary.load(dictionary, download=True)

    # Reading and writing each get their own progress bar.
    with ProgressBar():
        data, schema = ImportManager().import_dataset(self.dictionary, path)

    with ProgressBar():
        self.save(data, schema)
load(self, **kwargs)
¶
Loads a previously imported Biobank dataset.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
**kwargs |
|
Dictionary of keyword arguments |
{} |
Returns:
Type | Description |
---|---|
DataFrame |
A Dask DataFrame object |
Source code in biobank/dataset.py
def load(self, **kwargs) -> dd.DataFrame:
    """Loads a previously imported Biobank dataset.

    Args:
        **kwargs: Keyword arguments handed unchanged to ``dd.read_parquet``

    Returns:
        A Dask DataFrame object
    """
    parquet_location = str(self.path)
    return dd.read_parquet(parquet_location, **kwargs)
load_metadata(self)
¶
Loads metadata for the Biobank dataset.
Returns:
Type | Description |
---|---|
FileMetaData |
A FileMetaData object |
Source code in biobank/dataset.py
def load_metadata(self) -> pq.FileMetaData:
    """Loads metadata for the Biobank dataset.

    The metadata is read from the ``_common_metadata`` file located inside
    the dataset directory ``self.path``.

    Returns:
        A FileMetaData object
    """
    metadata_file = self.path / "_common_metadata"
    return pq.read_metadata(metadata_file)
save(self, data, schema, **kwargs)
¶
Saves a Biobank dataset.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
data |
|
Biobank dataset as a Dask Dataframe |
required |
schema |
|
Parquet schema |
required |
**kwargs |
|
Dictionary of keyword arguments |
{} |
Returns:
Type | Description |
---|---|
None |
None |
Source code in biobank/dataset.py
def save(self, data, schema, **kwargs) -> None:
    """Saves a Biobank dataset.

    Args:
        data: Biobank dataset as a Dask Dataframe
        schema: Parquet schema
        **kwargs: Extra keyword arguments forwarded to ``data.to_parquet``;
            they may override the default ``compression="snappy"`` and
            ``engine="pyarrow"``

    Returns:
        None
    """
    # Defaults go first so that caller-supplied keyword arguments win.
    # Previously **kwargs was accepted but never reached to_parquet.
    options = {"compression": "snappy", "engine": "pyarrow", **kwargs}
    print(f"saving dataset to {self.path}")
    data.to_parquet(self.path, schema=schema, **options)
select(self, fields=None, limit=None)
¶
Select specific fields from the Biobank dataset.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
fields |
|
List of fields to select |
None |
limit |
|
Number of rows to select |
None |
Returns:
Type | Description |
---|---|
DataFrame |
A Pandas DataFrame |
Source code in biobank/dataset.py
def select(self, fields=None, limit=None) -> pd.DataFrame:
    """Select specific fields from the Biobank dataset.

    Args:
        fields: List of fields to select; when falsy, all columns are loaded
        limit: Number of rows to select; when falsy, no row limit is applied

    Returns:
        A Pandas DataFrame. It is empty when ``fields`` was given but
        ``match_fields`` found no matching column.
    """
    columns = None
    if fields:
        columns = self.match_fields(fields)
        # len() rather than truthiness: the container type returned by
        # match_fields is not visible here and may not support bool().
        if len(columns) == 0:
            return pd.DataFrame()

    with ProgressBar():
        frame = self.load(columns=columns, use_threads=True)

        if limit:
            # Keep only rows whose index value is among the first `limit`
            # index values.
            leading_index = frame.index.head(limit)
            frame = frame.loc[frame.index.isin(leading_index)]

        # Missing values in float, datetime and object columns are
        # replaced by empty strings.
        nan_columns = frame.select_dtypes(
            [np.float64, np.datetime64, object]
        ).columns
        frame = frame.replace(
            to_replace={name: {np.nan: ""} for name in nan_columns}
        )

        result = frame.compute()

    return result
Dictionary class for managing data dictionary.
Dictionary
¶
Dictionary class for managing data dictionary.
__init__(self, path)
special
¶
Constructor.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
path |
Path |
Directory to store the dictionary |
required |
Source code in biobank/dictionary.py
def __init__(self, path: Path):
    """Constructor.

    Args:
        path: Directory to store the dictionary; the dictionary itself is
            located at ``path / self.filename``
    """
    self.path = path / self.filename
    # Starts out empty; nothing is loaded by the constructor.
    self._fields = None
download(self, path=None)
¶
Download dictionary from URL.
Returns:
Type | Description |
---|---|
None |
None |
Source code in biobank/dictionary.py
def download(self, path=None) -> None:
    """Download dictionary from URL.

    The table is read with ``dd.read_table``, computed, and written as
    Parquet to ``self.path``. The parent directory of ``self.path`` is
    created if it does not exist yet.

    Args:
        path: Location to read the dictionary from; when falsy,
            ``settings.dictionary.url`` is used

    Returns:
        None
    """
    source = path if path else settings.dictionary.url
    self.path.parent.mkdir(parents=True, exist_ok=True)

    with ProgressBar():
        print(f"loading dictionary from {source}")
        table = dd.read_table(source).compute()
        table.to_parquet(self.path)
filter(self, fields, search)
¶
Filter dictionary.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
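fields | List[str] | Field names; only dictionary rows whose FieldID matches one of their field IDs are kept | required |
search | str | Pattern matched case-insensitively against the Field column; an empty value disables this filter | required |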
Returns:
Type | Description |
---|---|
DataFrame | The filtered dictionary |
Source code in biobank/dictionary.py
def filter(self, fields: List[str], search: str) -> pd.DataFrame:
    """Filter dictionary.

    Args:
        fields: Field names; each is reduced to its field ID with
            ``get_field_id`` and only rows with a matching ``FieldID``
            are kept
        search: Pattern matched case-insensitively against the ``Field``
            column; a falsy value skips this step

    Returns:
        The filtered dictionary
    """
    wanted_ids = {self.get_field_id(name) for name in fields}

    entries = self.load()
    entries = entries[entries.FieldID.isin(wanted_ids)]

    if not search:
        return entries

    matches = entries.Field.str.contains(search, case=False)
    return entries[matches]
get_field_id(self, field)
¶
Get field ID.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
field |
str |
Field name |
required |
Returns:
Type | Description |
---|---|
str |
Field ID as str |
Source code in biobank/dictionary.py
def get_field_id(self, field: str) -> str:
    """Get field ID.

    The field ID is everything before the first ``-`` in the field name;
    a name without ``-`` is returned unchanged.

    Args:
        field: Field name

    Returns:
        Field ID as str
    """
    field_id, _, _ = field.partition("-")
    return field_id
get_pandas_dtype(self, field)
¶
Get Pandas type for a field.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
field |
|
Name of field |
required |
Returns:
Type | Description |
---|---|
Any |
Pandas dtype |
Source code in biobank/dictionary.py
def get_pandas_dtype(self, field) -> Any:
    """Get Pandas type for a field.

    Args:
        field: Name of field

    Returns:
        Pandas dtype, or None when ``get_type`` yields no type for the
        field
    """
    arrow_type = self.get_type(field)
    if not arrow_type:
        return None

    # 64-bit integers map to the pandas nullable integer type instead of
    # the dtype Arrow would suggest.
    if arrow_type == pa.int64():
        return "Int64"

    return arrow_type.to_pandas_dtype()
get_type(self, field)
¶
Get Arrow data type for a field.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
field |
|
Name of field |
required |
Returns:
Type | Description |
---|---|
DataType |
Arrow field type |
Source code in biobank/dictionary.py
def get_type(self, field) -> pa.DataType:
"""Get Arrow data type for a field.
Args:
field: Name of field
Returns:
Arrow field type
"""
field = self.get_field_id(field)
if field in self.fields.index:
return self.fields.loc[field].Type