"""Utilities functions for dataset/metadata retrieval
"""
import os, sys
import pandas as pd
from .label_name_list import dataset2target_lists
from .misc import fuzzy_search
from .load import pd_load
from ..metadata import dataset_names, benchmark_names, dataset_list
def get_label_map(name, path = './data', target = None, file_format = 'csv', output_format = 'dict', task = 'DDI', name_column = 'Map'):
    """Look up the biomedical meaning attached to a dataset's labels.

    Args:
        name (str): rough dataset name; resolved with ``fuzzy_search`` against
            the dataset names registered for ``task``
        path (str, optional): the dataset path, passed on to ``pd_load``
        target (None, optional): the label column used as dictionary keys;
            falls back to ``'Y'`` when None
        file_format (str, optional): format of the file; accepted for
            interface compatibility, not read by this function
        output_format (str, optional): one of ``'dict'``, ``'df'`` or ``'array'``
        task (str, optional): the task whose dataset names are searched
        name_column (str, optional): the column that stores the label name

    Returns:
        dict/pd.DataFrame/np.array: for ``'dict'`` a mapping from the values
        of the target column to the values of ``name_column``; for ``'df'``
        the loaded table itself; for ``'array'`` the ``.values`` of
        ``name_column``

    Raises:
        ValueError: output_format not supported.
    """
    resolved_name = fuzzy_search(name, dataset_names[task])
    label_column = 'Y' if target is None else target

    # The table is loaded before output_format is validated, so an unknown
    # format is only reported after the data has been read.
    table = pd_load(resolved_name, path)

    if output_format == 'dict':
        label_values = table[label_column].values
        label_names = table[name_column].values
        return dict(zip(label_values, label_names))
    if output_format == 'df':
        return table
    if output_format == 'array':
        return table[name_column].values

    raise ValueError("Please use the correct output format, select from dict, df, array.")
def get_reaction_type(name, path = './data', output_format = 'array'):
    """Fetch the reaction categories recorded for a reaction dataset.

    Args:
        name (str): rough dataset name; resolved with ``fuzzy_search`` against
            the ``'RetroSyn'`` dataset names
        path (str, optional): dataset path, passed on to ``pd_load``
        output_format (str, optional): ``'df'`` for the loaded table or
            ``'array'`` for the raw values of its ``'category'`` column

    Returns:
        pd.DataFrame/np.array: the loaded table when output_format is
        ``'df'``, the ``.values`` of the ``'category'`` column when it is
        ``'array'``

    Raises:
        ValueError: the output format is not supported
    """
    resolved_name = fuzzy_search(name, dataset_names['RetroSyn'])

    # Loading happens first; the format check below only runs afterwards.
    reactions = pd_load(resolved_name, path)

    if output_format == 'df':
        return reactions
    if output_format == 'array':
        return reactions['category'].values

    raise ValueError("Please use the correct output format, select from df, array.")
def retrieve_label_name_list(name):
    """Return the labels available for the queried dataset.

    Args:
        name (str): rough dataset name; resolved with ``fuzzy_search``
            against ``dataset_list``

    Returns:
        list: the entry of ``dataset2target_lists`` for the resolved name
    """
    resolved_name = fuzzy_search(name, dataset_list)
    available_labels = dataset2target_lists[resolved_name]
    return available_labels
def retrieve_dataset_names(name):
    """Return every dataset name registered under a task.

    The task name is used as-is for the lookup in ``dataset_names``; no
    fuzzy matching is applied here.

    Args:
        name (str): the name of the query task

    Returns:
        list: the datasets stored in ``dataset_names`` for that task
    """
    task_datasets = dataset_names[name]
    return task_datasets
def retrieve_all_benchmarks():
    """Return the names of all available benchmark groups.

    Returns:
        list: the keys of ``benchmark_names``, as a new list
    """
    # Iterating a mapping yields its keys, so this matches list(d.keys()).
    group_names = list(benchmark_names)
    return group_names
def retrieve_benchmark_names(name):
    """Return all benchmarks that belong to a queried benchmark group.

    Args:
        name (str): rough name of the benchmark group; resolved with
            ``fuzzy_search`` against the keys of ``benchmark_names``

    Returns:
        list: every dataset of the group, flattened across its tasks in the
        iteration order of the group's task mapping
    """
    group = fuzzy_search(name, list(benchmark_names.keys()))

    # The group entry maps each task to its datasets. Only the datasets are
    # needed, so iterate the values and keep the locals distinct from the
    # module-level ``dataset_names`` import.
    flattened = []
    for task_datasets in benchmark_names[group].values():
        flattened.extend(task_datasets)
    return flattened