"""Utilities functions for transform labels
"""
import numpy as np
import pandas as pd
import os, sys
from .misc import fuzzy_search
def convert_y_unit(y, from_, to_):
    """Convert labels between the 'nM' and 'p' units.

    The 'p' value of an 'nM' label ``v`` is ``-log10(v * 1e-9 + 1e-10)``;
    the ``1e-10`` offset keeps the logarithm finite when ``v`` is 0. The
    reverse direction inverts that formula exactly.

    Args:
        y (list | tuple | np.ndarray | scalar): labels to convert. Plain
            lists and tuples are converted to a float array first; other
            inputs are used as given.
        from_ (str): source unit, 'nM' or 'p'
        to_ (str): target unit, 'p' or 'nM'

    Returns:
        np.array: the transformed labels. Unit strings other than 'nM'/'p'
        are ignored, so an unrecognised ``from_``/``to_`` leaves the values
        untouched for that step.
    """
    # Python sequences support neither unary minus nor float multiplication,
    # so turn them into a float array before doing any arithmetic.
    if isinstance(y, (list, tuple)):
        y = np.asarray(y, dtype=float)

    # Step 1: bring the input to nM ('nM' input needs no work).
    if from_ == "p":
        # Float base: NumPy refuses integer ** negative-integer.
        y = (10.0 ** (-y) - 1e-10) / 1e-9

    # Step 2: go from nM to the requested unit ('nM' target needs no work).
    if to_ == "p":
        y = -np.log10(y * 1e-9 + 1e-10)

    return y
def convert_to_log(y):
    """Transform nM labels to the log ('p') scale.

    Args:
        y (list): a list of labels in nM

    Returns:
        np.array: the labels after the 'nM' -> 'p' conversion performed by
        ``convert_y_unit``
    """
    labels = np.array(y)
    return convert_y_unit(labels, "nM", "p")
def convert_back_log(y):
    """Transform log-scale ('p') labels back to nM.

    Args:
        y (list): a list of labels in log-scale ('p')

    Returns:
        np.array: the labels after the 'p' -> 'nM' conversion performed by
        ``convert_y_unit``
    """
    labels = np.array(y)
    return convert_y_unit(labels, "p", "nM")
def binarize(y, threshold, order="ascending"):
    """Binarize labels against a pre-specified threshold.

    Args:
        y (list): a list (or array) of labels
        threshold (float): the cut-off used to turn a label into 1 or 0
        order (str, optional): with "ascending", labels strictly above the
            threshold become 1 and all others 0; with "descending", labels
            strictly below the threshold become 1 and all others 0. A label
            equal to the threshold is 0 in both modes.

    Returns:
        np.array: an integer array of 0/1 with the same shape as ``y``

    Raises:
        AttributeError: if ``order`` is neither "ascending" nor "descending"
    """
    values = np.asarray(y)

    if order == "ascending":
        mask = values > threshold
    elif order == "descending":
        mask = values < threshold
    else:
        # AttributeError (rather than ValueError) is kept for callers that
        # already catch it.
        raise AttributeError("'order' must be either ascending or descending")

    # Casting the boolean mask is vectorised, keeps the input's shape, and
    # yields an integer dtype even when the input is empty.
    return mask.astype(int)
def label_dist(y, name=None):
    """Plot the distribution of the labels.

    Draws a box plot above a histogram/density plot on a shared x-axis, marks
    the median (blue) and mean (green) with dashed vertical lines, and shows
    the figure.

    Args:
        y (list): a list of labels
        name (None, optional): dataset name, used in the plot title
    """
    try:
        import seaborn as sns
        import matplotlib.pyplot as plt
    except ImportError:
        # Only a missing package should trigger installation; any other
        # failure while importing is left to propagate.
        from .misc import install

        install("seaborn")
        install("matplotlib")
        import seaborn as sns
        import matplotlib.pyplot as plt

    median = np.median(y)
    mean = np.mean(y)

    _, (ax_box, ax_hist) = plt.subplots(
        2, sharex=True, gridspec_kw={"height_ratios": (0.15, 1)}
    )

    if name is None:
        title = "Label Distribution"
    else:
        title = "Label Distribution of " + str(name) + " Dataset"

    # Pass the data as ``x=`` so the box is horizontal and lines up with the
    # shared x-axis regardless of how the seaborn version treats a bare
    # positional vector.
    sns.boxplot(x=y, ax=ax_box).set_title(title)
    ax_box.axvline(median, color="b", linestyle="--")
    ax_box.axvline(mean, color="g", linestyle="--")

    # ``distplot`` is deprecated; use ``histplot`` when the installed seaborn
    # provides it and fall back otherwise.
    if hasattr(sns, "histplot"):
        sns.histplot(x=y, kde=True, stat="density", ax=ax_hist)
    else:
        sns.distplot(y, ax=ax_hist)

    # Label the marker lines themselves so the legend entries are attached
    # to the right artists.
    ax_hist.axvline(median, color="b", linestyle="--", label="Median")
    ax_hist.axvline(mean, color="g", linestyle="--", label="Mean")
    ax_hist.legend()

    ax_box.set(xlabel="")
    plt.show()
# print("The median is " + str(median), flush = True, file = sys.stderr)
# print("The mean is " + str(mean), flush = True, file = sys.stderr)
def _append_negatives(df, neg_pairs, features1, features2, column_names):
    """Return ``df`` with one ``Y = 0`` row appended per pair in ``neg_pairs``."""
    id1, x1, id2, x2 = column_names
    # Sorting removes the dependence on set iteration order, so the output
    # row order is reproducible between runs.
    rows = [[a, features1[a], b, features2[b], 0] for a, b in sorted(neg_pairs)]
    if not rows:
        return df.reset_index(drop=True)
    neg_df = pd.DataFrame(rows, columns=[id1, x1, id2, x2, "Y"])
    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    return pd.concat([df, neg_df]).reset_index(drop=True)


def NegSample(df, column_names, frac, two_types):
    """Negative Sampling for Binary Interaction Dataset

    Randomly pairs entities that do not occur together in ``df`` and appends
    them as rows with ``Y = 0``. Both id columns are converted to strings in
    the returned frame; the frame passed in is left unmodified.

    Note:
        NumPy's global random generator is re-seeded with 1234 on every call.

    Args:
        df (pandas.DataFrame): input dataset dataframe
        column_names (list): column names in the order of [id1, x1, id2, x2]
        frac (float): the ratio of negative samples compared to positive
            samples; ``int(len(df) * frac)`` negatives are generated
        two_types (bool): whether the two entity types are different (e.g.
            drug-target) or a single entity type (e.g. drug-drug). With a
            single type, both id columns form one pool and a negative pair
            never repeats the same entity.

    Returns:
        pandas.DataFrame: a new dataframe with negative samples (Y = 0)

    Raises:
        ValueError: if ``frac`` is negative, or if more negatives are
            requested than there are unobserved pairs to draw from
    """
    if frac < 0:
        raise ValueError("'frac' must be non-negative")

    id1, x1, id2, x2 = column_names
    n_neg = int(len(df) * frac)

    # Work on a copy so the caller's dataframe is not altered.
    df = df.copy()
    df[id1] = df[id1].apply(str)
    df[id2] = df[id2].apply(str)

    pos_set = {(a, b) for a, b in df[[id1, id2]].values}

    np.random.seed(1234)

    if n_neg == 0:
        return df.reset_index(drop=True)

    if not two_types:
        pool = np.unique(df[[id1, id2]].values.reshape(-1))

        # Ordered pairs of distinct entities that are not already positive.
        available = len(pool) * (len(pool) - 1) - sum(
            1 for a, b in pos_set if a != b
        )
        if n_neg > available:
            raise ValueError(
                "cannot draw %d negative pairs: only %d unobserved pairs exist"
                % (n_neg, available)
            )

        draws = np.random.choice(pool, size=(n_neg, 2), replace=True)
        neg_set = {(a, b) for a, b in draws if a != b} - pos_set

        # Top up one pair at a time until the requested count is reached.
        while len(neg_set) < n_neg:
            a, b = np.random.choice(pool, 2, replace=False)
            if (a, b) not in pos_set:
                neg_set.add((a, b))

        # A single lookup serves both sides because the entities share a pool.
        features = dict(df[[id1, x1]].values)
        features.update(df[[id2, x2]].values)
        return _append_negatives(df, neg_set, features, features, column_names)

    pool1 = np.unique(df[id1].values.reshape(-1))
    pool2 = np.unique(df[id2].values.reshape(-1))

    available = len(pool1) * len(pool2) - len(pos_set)
    if n_neg > available:
        raise ValueError(
            "cannot draw %d negative pairs: only %d unobserved pairs exist"
            % (n_neg, available)
        )

    draws1 = np.random.choice(pool1, size=n_neg, replace=True)
    draws2 = np.random.choice(pool2, size=n_neg, replace=True)
    neg_set = {(a, b) for a, b in zip(draws1, draws2) if a != b} - pos_set

    # Top up one pair at a time until the requested count is reached.
    while len(neg_set) < n_neg:
        a = np.random.choice(pool1, size=1, replace=True)[0]
        b = np.random.choice(pool2, size=1, replace=True)[0]
        if (a, b) not in pos_set:
            neg_set.add((a, b))

    features1 = dict(df[[id1, x1]].values)
    features2 = dict(df[[id2, x2]].values)
    return _append_negatives(df, neg_set, features1, features2, column_names)