Source code for tdc.utils.retrieve

"""Utility functions for dataset/metadata retrieval
"""
import os, sys
import pandas as pd
from .label_name_list import dataset2target_lists
from .misc import fuzzy_search
from .load import pd_load
from ..metadata import dataset_names, benchmark_names, dataset_list


def get_label_map(
    name,
    path="./data",
    target=None,
    file_format="csv",
    output_format="dict",
    task="DDI",
    name_column="Map",
):
    """Return the label-to-name mapping stored with a dataset.

    The dataset name is resolved with ``fuzzy_search`` against the names
    listed under ``dataset_names[task]`` and the table is then read with
    ``pd_load``.

    Args:
        name (str): dataset name, resolved against ``dataset_names[task]``
        path (str, optional): dataset location handed to ``pd_load``
        target (None, optional): column holding the labels; ``None`` selects
            the column ``"Y"``
        file_format (str, optional): kept in the signature, but not read by
            this function's body
        output_format (str, optional): one of ``"dict"``, ``"df"``, ``"array"``
        task (str, optional): key used to look up the candidate dataset names
        name_column (str, optional): column holding the label names

    Returns:
        dict/pd.DataFrame/np.array: for ``"dict"`` a dictionary pairing each
        value of the label column with the value of ``name_column`` in the
        same row; for ``"df"`` the loaded table itself; for ``"array"`` the
        ``.values`` of ``name_column``

    Raises:
        ValueError: ``output_format`` is not one of the supported options.
    """
    resolved_name = fuzzy_search(name, dataset_names[task])
    label_column = "Y" if target is None else target
    table = pd_load(resolved_name, path)

    if output_format == "dict":
        labels = table[label_column].values
        meanings = table[name_column].values
        return dict(zip(labels, meanings))
    if output_format == "df":
        return table
    if output_format == "array":
        return table[name_column].values

    raise ValueError(
        "Please use the correct output format, select from dict, df, array."
    )
def get_reaction_type(name, path="./data", output_format="array"):
    """Return the reaction categories stored with a reaction dataset.

    The dataset name is resolved with ``fuzzy_search`` against
    ``dataset_names["RetroSyn"]`` and the table is then read with ``pd_load``.

    Args:
        name (str): dataset name, resolved against the ``"RetroSyn"`` names
        path (str, optional): dataset location handed to ``pd_load``
        output_format (str, optional): ``"df"`` or ``"array"``

    Returns:
        pd.DataFrame/np.array: for ``"df"`` the loaded table itself; for
        ``"array"`` the ``.values`` of its ``"category"`` column

    Raises:
        ValueError: ``output_format`` is not one of the supported options.
    """
    resolved_name = fuzzy_search(name, dataset_names["RetroSyn"])
    table = pd_load(resolved_name, path)

    if output_format == "df":
        return table
    if output_format == "array":
        return table["category"].values

    raise ValueError("Please use the correct output format, select from df, array.")
def retrieve_label_name_list(name):
    """Return the labels available for a dataset.

    Args:
        name (str): rough dataset name; it is resolved with ``fuzzy_search``
            against ``dataset_list`` before the lookup

    Returns:
        list: the entry of ``dataset2target_lists`` for the resolved name
    """
    resolved_name = fuzzy_search(name, dataset_list)
    available_labels = dataset2target_lists[resolved_name]
    return available_labels
def retrieve_dataset_names(name):
    """Return the dataset names registered for a task.

    The task name is used as-is to subscript ``dataset_names``; no fuzzy
    matching is applied here.

    Args:
        name (str): the name of the query task

    Returns:
        list: the entry of ``dataset_names`` stored under ``name``
    """
    datasets_for_task = dataset_names[name]
    return datasets_for_task
def retrieve_all_benchmarks():
    """Return the names of all benchmark groups.

    Returns:
        list: a new list holding the keys of ``benchmark_names``
    """
    group_names = [*benchmark_names.keys()]
    return group_names
def retrieve_benchmark_names(name):
    """Return every benchmark contained in a benchmark group.

    Args:
        name (str): the name of the benchmark group; it is resolved with
            ``fuzzy_search`` against the keys of ``benchmark_names``

    Returns:
        list: the benchmarks of the group, flattened across all of its tasks
        in the iteration order of the group's mapping
    """
    group = fuzzy_search(name, list(benchmark_names.keys()))
    datasets_by_task = benchmark_names[group]

    # Only the per-task dataset collections are needed, not the task keys.
    # The local names are deliberately distinct from the module-level
    # ``dataset_names`` import so nothing is shadowed.
    flattened = []
    for task_datasets in datasets_by_task.values():
        flattened.extend(task_datasets)
    return flattened