Source code for tdc.utils.label

"""Utility functions for transforming labels
"""
import numpy as np
import pandas as pd
import os, sys

from .misc import fuzzy_search


def convert_y_unit(y, from_, to_):
    """label unit conversion helper function

    Converts between nanomolar values ('nM') and their negative log10
    molar representation ('p'). An unrecognised unit name leaves the
    values untouched for that step.

    Args:
        y (list): a list (or array) of labels
        from_ (str): source units, 'nM'/'p'
        to_ (str): target units, 'p'/'nM'

    Returns:
        np.array: a numpy array of transformed labels
    """
    # Coerce to a float array first: plain lists do not support unary minus,
    # and numpy refuses to raise integers to negative integer powers.
    y = np.asarray(y, dtype=float)

    # Step 1: bring the input to nM.
    if from_ == "p":
        y = (10 ** (-y) - 1e-10) / 1e-9

    # Step 2: bring nM to the target unit. The 1e-10 offset keeps log10
    # finite when a label is exactly 0 nM.
    if to_ == "p":
        y = -np.log10(y * 1e-9 + 1e-10)

    return y
def label_transform(
    y, binary, threshold, convert_to_log, verbose=True, order="descending"
):
    """label transformation helper function

    Labels with at most two distinct values are returned unchanged.
    Otherwise they are either binarized against ``threshold`` (when
    ``binary`` is set) or converted from nM to log scale (when
    ``convert_to_log`` is set). Binarization takes precedence.

    Args:
        y (list): a list of labels
        binary (bool): whether or not to conduct binarization
        threshold (float): the threshold for binarization
        convert_to_log (bool): convert to log-scale for continuous values
            such as Kd and etc
        verbose (bool, optional): whether or not to print intermediate
            processing statements (written to stderr)
        order (str, optional): if 'descending', the label is 1 for values
            less than the threshold and 0 otherwise; if 'ascending', the
            label is 1 for values greater than the threshold. Defaults to
            'descending'

    Returns:
        np.array: an array of transformed labels (the input object itself
        when no transformation applies)

    Raises:
        ValueError: specify the correct order from 'descending'/'ascending'
    """
    # Both transformations only make sense for non-binary labels.
    multi_valued = len(np.unique(y)) > 2

    if multi_valued and binary:
        if verbose:
            print(
                "Binarization using threshold "
                + str(threshold)
                + ", you can specify your threshold values by threshold = X",
                flush=True,
                file=sys.stderr,
            )
        if order == "descending":
            return (np.asarray(y) < threshold).astype(int)
        if order == "ascending":
            return (np.asarray(y) > threshold).astype(int)
        raise ValueError("Please select order from 'descending' or 'ascending'!")

    if multi_valued and convert_to_log:
        if verbose:
            print("To log space...", flush=True, file=sys.stderr)
        return convert_y_unit(np.array(y), "nM", "p")

    return y
def convert_to_log(y):
    """Convert nM labels to log scale ('p' units).

    Args:
        y (list): a list of labels in nM

    Returns:
        np.array: an array of log-transformed labels
    """
    labels = np.array(y)
    return convert_y_unit(labels, "nM", "p")
def convert_back_log(y):
    """Convert log-scale ('p' units) labels back to nM.

    Args:
        y (list): a list of labels in log-scale

    Returns:
        np.array: an array of labels converted from 'p' to 'nM'
    """
    log_labels = np.array(y)
    return convert_y_unit(log_labels, "p", "nM")
def binarize(y, threshold, order="ascending"):
    """Turn a list of labels into 0/1 values using a fixed threshold.

    Args:
        y (list): a list of labels
        threshold (float): the cut-off used to assign 1 or 0
        order (str, optional): with 'ascending', labels strictly above the
            threshold become 1 and the rest 0; with 'descending', labels
            strictly below the threshold become 1 and the rest 0

    Returns:
        np.array: an array of transformed labels

    Raises:
        AttributeError: if order is neither 'ascending' nor 'descending'
    """
    if order not in ("ascending", "descending"):
        raise AttributeError("'order' must be either ascending or descending")

    values = np.array(y)
    hits = values > threshold if order == "ascending" else values < threshold
    return np.array([1 if hit else 0 for hit in hits])
def label_dist(y, name=None):
    """plot the distribution of label

    Draws a box plot above a histogram/density plot on a shared x axis,
    marks the median (blue) and mean (green) with dashed vertical lines on
    both, and shows the figure.

    Args:
        y (list): a list of labels
        name (None, optional): dataset name, used in the plot title
    """
    try:
        import seaborn as sns
        import matplotlib.pyplot as plt
    except ImportError:
        # Only a missing package should trigger the fallback; a bare except
        # would also swallow KeyboardInterrupt and unrelated failures.
        # presumably `install` installs the named package -- defined in .misc
        from .misc import install

        install("seaborn")
        install("matplotlib")
        import seaborn as sns
        import matplotlib.pyplot as plt

    median = np.median(y)
    mean = np.mean(y)

    _, (ax_box, ax_hist) = plt.subplots(
        2, sharex=True, gridspec_kw={"height_ratios": (0.15, 1)}
    )

    if name is None:
        title = "Label Distribution"
    else:
        title = "Label Distribution of " + str(name) + " Dataset"

    # Pass the data as x explicitly so the box is horizontal and lines up
    # with the shared x axis regardless of how the installed seaborn
    # interprets a positional argument.
    sns.boxplot(x=y, ax=ax_box).set_title(title)
    ax_box.axvline(median, color="b", linestyle="--")
    ax_box.axvline(mean, color="g", linestyle="--")

    # NOTE(review): distplot is deprecated in recent seaborn releases;
    # consider histplot(..., kde=True) once the minimum version allows it.
    sns.distplot(y, ax=ax_hist)
    # Label the reference lines themselves. Passing label strings to
    # legend() would attach them to the first artists on the axes (the
    # histogram/density artists), not to these lines.
    ax_hist.axvline(median, color="b", linestyle="--", label="Median")
    ax_hist.axvline(mean, color="g", linestyle="--", label="Mean")
    ax_hist.legend()

    ax_box.set(xlabel="")
    plt.show()
def NegSample(df, column_names, frac, two_types):
    """Negative Sampling for Binary Interaction Dataset

    Randomly draws id pairs that do not occur in ``df`` and appends them
    as rows with ``Y = 0``. The input frame is not modified; the id
    columns of the returned frame are cast to ``str``.

    Note: this reseeds numpy's global random state with 1234 so that the
    sampled negatives are reproducible.

    Args:
        df (pandas.DataFrame): input dataset dataframe
        column_names (list): column names in the order of [id1, x1, id2, x2]
        frac (float): the ratio of negative samples compared to positive
            samples
        two_types (bool): whether or not if the two entity types are
            different (e.g. drug-target) or single entity type
            (e.g. drug-drug)

    Returns:
        pandas.DataFrame: a new dataframe with negative samples (Y = 0)

    Raises:
        ValueError: if ``frac`` is negative, or if more negative pairs are
            requested than distinct non-positive pairs exist
    """
    id1, x1, id2, x2 = column_names

    # Work on a copy so the string cast does not leak into the caller's frame.
    df = df.copy()
    df[id1] = df[id1].apply(str)
    df[id2] = df[id2].apply(str)

    n_neg = int(len(df) * frac)
    if n_neg < 0:
        raise ValueError("frac must be non-negative")
    if n_neg == 0:
        return df.reset_index(drop=True)

    pos_set = set(zip(df[id1], df[id2]))
    np.random.seed(1234)

    if not two_types:
        # Single entity type: both sides of a pair come from one shared pool
        # and a pair may not contain the same entity twice.
        pool = np.unique(df[[id1, id2]].values.reshape(-1))
        capacity = len(pool) * (len(pool) - 1) - sum(
            1 for a, b in pos_set if a != b
        )
        if n_neg > capacity:
            # Without this check the top-up loop below would never terminate.
            raise ValueError(
                "Requested " + str(n_neg) + " negative samples but only "
                + str(capacity) + " distinct negative pairs exist"
            )

        samples = np.random.choice(pool, size=(n_neg, 2), replace=True)
        neg_set = {(a, b) for a, b in samples if a != b} - pos_set
        # The first draw may contain duplicates, self-pairs or positives;
        # top up one pair at a time until the target count is reached.
        while len(neg_set) < n_neg:
            a, b = np.random.choice(pool, 2, replace=False)
            if (a, b) not in pos_set:
                neg_set.add((a, b))

        # One shared id -> feature lookup, since ids can appear on either side.
        seq1 = dict(zip(df[id1], df[x1]))
        seq1.update(zip(df[id2], df[x2]))
        seq2 = seq1
    else:
        # Two entity types: each side is sampled from its own pool.
        pool1 = np.unique(df[id1].values)
        pool2 = np.unique(df[id2].values)
        capacity = len(pool1) * len(pool2) - len(pos_set)
        if n_neg > capacity:
            raise ValueError(
                "Requested " + str(n_neg) + " negative samples but only "
                + str(capacity) + " distinct negative pairs exist"
            )

        sample_id1 = np.random.choice(pool1, size=n_neg, replace=True)
        sample_id2 = np.random.choice(pool2, size=n_neg, replace=True)
        neg_set = {
            (a, b) for a, b in zip(sample_id1, sample_id2) if a != b
        } - pos_set
        while len(neg_set) < n_neg:
            a = np.random.choice(pool1, size=1, replace=True)[0]
            b = np.random.choice(pool2, size=1, replace=True)[0]
            if (a, b) not in pos_set:
                neg_set.add((a, b))

        seq1 = dict(zip(df[id1], df[x1]))
        seq2 = dict(zip(df[id2], df[x2]))

    # Sort so the row order does not depend on set iteration order.
    neg_rows = [[a, seq1[a], b, seq2[b], 0] for a, b in sorted(neg_set)]
    neg_df = pd.DataFrame(neg_rows, columns=[id1, x1, id2, x2, "Y"])

    # DataFrame.append was removed in pandas 2.0; concat is the equivalent.
    return pd.concat([df, neg_df], ignore_index=True)