Source code for tdc.utils.retrieve

"""Utility functions for dataset/metadata retrieval
"""
import os, sys
import pandas as pd
from .label_name_list import dataset2target_lists
from .misc import fuzzy_search
from .load import pd_load
from ..metadata import dataset_names, benchmark_names, dataset_list

def get_label_map(name, path = './data', target = None, file_format = 'csv', output_format = 'dict', task = 'DDI', name_column = 'Map'):
    """Retrieve the biomedical meaning of a dataset's labels.

    The dataset name is first resolved with ``fuzzy_search`` against
    ``dataset_names[task]``; the resolved name and ``path`` are then handed
    to ``pd_load``. The data is always loaded before ``output_format`` is
    checked.

    Args:
        name (str): the name of the dataset (resolved via ``fuzzy_search``)
        path (str, optional): the dataset path, passed through to ``pd_load``
        target (None, optional): the label column name; ``'Y'`` is used when
            this is None
        file_format (str, optional): format of the file. Accepted for
            interface compatibility; this function does not read it.
        output_format (str, optional): one of ``'dict'``, ``'df'``, ``'array'``
        task (str, optional): the task name, used as the key into
            ``dataset_names``
        name_column (str, optional): the column that stores the label name

    Returns:
        dict/pd.DataFrame/np.array: for ``'dict'``, a mapping built by zipping
        the target column's values with the ``name_column`` values (a repeated
        target value keeps the last name seen); for ``'df'``, the loaded
        object as returned by ``pd_load``; for ``'array'``, the ``.values`` of
        ``name_column``.

    Raises:
        ValueError: output_format not supported.
    """
    resolved_name = fuzzy_search(name, dataset_names[task])
    label_column = 'Y' if target is None else target
    table = pd_load(resolved_name, path)

    if output_format == 'dict':
        label_values = table[label_column].values
        label_names = table[name_column].values
        return dict(zip(label_values, label_names))
    if output_format == 'df':
        return table
    if output_format == 'array':
        return table[name_column].values

    raise ValueError("Please use the correct output format, select from dict, df, array.")
def get_reaction_type(name, path = './data', output_format = 'array'):
    """Retrieve the reaction types stored with a reaction dataset.

    The dataset name is resolved with ``fuzzy_search`` against
    ``dataset_names['RetroSyn']`` and loaded through ``pd_load`` before
    ``output_format`` is checked.

    Args:
        name (str): dataset name (resolved via ``fuzzy_search``)
        path (str, optional): dataset path, passed through to ``pd_load``
        output_format (str, optional): ``'df'`` for the loaded object as-is,
            ``'array'`` for the ``.values`` of its ``'category'`` column

    Returns:
        pd.DataFrame/np.array: when output_format is df/array

    Raises:
        ValueError: the output format is not supported
    """
    resolved_name = fuzzy_search(name, dataset_names['RetroSyn'])
    reactions = pd_load(resolved_name, path)

    if output_format == 'df':
        return reactions
    if output_format == 'array':
        return reactions['category'].values

    raise ValueError("Please use the correct output format, select from df, array.")
def retrieve_label_name_list(name):
    """Get the available labels for a queried dataset.

    Args:
        name (str): rough dataset name; resolved against ``dataset_list``
            with ``fuzzy_search``

    Returns:
        list: the entry of ``dataset2target_lists`` for the resolved name
    """
    matched_dataset = fuzzy_search(name, dataset_list)
    available_labels = dataset2target_lists[matched_dataset]
    return available_labels
def retrieve_dataset_names(name):
    """Get all available dataset names for a task.

    The lookup is a direct index into ``dataset_names``; no fuzzy matching
    is applied to ``name`` here.

    Args:
        name (str): the name of the query task, used as-is as the key

    Returns:
        list: the ``dataset_names`` entry for that task
    """
    datasets_for_task = dataset_names[name]
    return datasets_for_task
def retrieve_all_benchmarks():
    """Get all available benchmark groups.

    Returns:
        list: the keys of ``benchmark_names``, i.e. the benchmark group names
    """
    group_names = benchmark_names.keys()
    return list(group_names)
def retrieve_benchmark_names(name):
    """Get all available benchmarks in a queried benchmark group.

    Args:
        name (str): the name of the benchmark group; resolved against the
            keys of ``benchmark_names`` with ``fuzzy_search``

    Returns:
        list: every benchmark found under the resolved group, flattened
        across its per-task entries in iteration order
    """
    group_key = fuzzy_search(name, list(benchmark_names.keys()))
    benchmarks_by_task = benchmark_names[group_key]
    # Flatten the task -> benchmarks mapping; the task keys themselves are not returned.
    return [
        benchmark
        for task_benchmarks in benchmarks_by_task.values()
        for benchmark in task_benchmarks
    ]