# Source code for tdc.benchmark_group.docking_group

# -*- coding: utf-8 -*-
# Author: TDC Team
# License: MIT

import pandas as pd
import numpy as np
import os, sys, json 
import warnings
warnings.filterwarnings("ignore")

from .base_group import BenchmarkGroup
from ..utils import bm_group_load, print_sys, fuzzy_search
from ..metadata import get_task2category, bm_metric_names, benchmark_names, bm_split_names, docking_target_info
from ..evaluator import Evaluator

class docking_group(BenchmarkGroup):
    """Create a docking group benchmark loader.

    Args:
        path (str, optional): the folder path to save/load the benchmarks.
        num_workers (int, optional): number of workers to parallelize dockings
        num_cpus (int, optional): number of CPUs assigned to docking
        num_max_call (int, optional): maximum number of oracle calls
        pyscreener_path (str, optional): the path to pyscreener repository in
            order to call docking scores. It is forwarded unchanged to the
            docking oracle built in ``__next__`` and ``evaluate``.
    """

    def __init__(self, path='./data', num_workers=None, num_cpus=None,
                 num_max_call=5000, pyscreener_path=None):
        """Create a docking group benchmark loader.

        When both ``num_workers`` and ``num_cpus`` are left as ``None`` they
        are derived from the number of CPUs reported by the operating system.
        """
        super().__init__(name='Docking_Group', path=path, file_format='oracle')

        # __next__ and evaluate read this attribute, so it must always exist.
        # NOTE(review): whether the oracle accepts ``None`` here is not visible
        # from this file -- confirm against the Oracle implementation.
        self.pyscreener_path = pyscreener_path

        if (num_workers is None) and (num_cpus is None):
            # Automatic selection. os.cpu_count() may return None when the
            # count cannot be determined; treat that as a single CPU.
            cpu_total = os.cpu_count() or 1
            num_cpus = 2 if cpu_total > 1 else 1
            num_workers = cpu_total // num_cpus

        self.num_workers = num_workers
        self.num_cpus = num_cpus
        self.num_max_call = num_max_call

    def __iter__(self):
        """Iterate over the docking targets.

        Returns:
            docking_group: the docking group class itself
        """
        self.index = 0
        self.num_datasets = len(self.dataset_names)
        return self

    def __next__(self):
        """Retrieve the next benchmark.

        Returns:
            dict: a dictionary of oracle function (``'oracle'``), molecule
            library dataset (``'data'``), and the name of the docking target
            (``'name'``)

        Raises:
            StopIteration: stop when all benchmarks are obtained.
        """
        if self.index >= self.num_datasets:
            raise StopIteration

        dataset = self.dataset_names[self.index]
        print_sys('--- ' + dataset + ' ---')

        # Per-target working directory handed to the oracle.
        data_path = os.path.join(self.path, dataset)
        os.makedirs(data_path, exist_ok=True)

        target_pdb_file = os.path.join(self.path, dataset + '.pdb')
        self.index += 1

        from ..oracles import Oracle
        oracle = Oracle(name="Docking_Score",
                        pyscreener_path=self.pyscreener_path,
                        receptors=[target_pdb_file],
                        center=docking_target_info[dataset]['center'],
                        size=docking_target_info[dataset]['size'],
                        buffer=10,
                        path=data_path,
                        num_worker=self.num_workers,
                        ncpu=self.num_cpus,
                        num_max_call=self.num_max_call)
        data = pd.read_csv(os.path.join(self.path, 'zinc.tab'), sep='\t')
        return {'oracle': oracle, 'data': data, 'name': dataset}

    def get_train_valid_split(self, seed, benchmark, split_type='default'):
        """No split for docking group.

        Raises:
            ValueError: always; the docking group has no train/valid split.
        """
        raise ValueError("Docking molecule generation does not have the concept of training/testing split! Checkout the usage in tdcommons.ai !")

    def get(self, benchmark, num_max_call=5000):
        """Retrieve one benchmark given benchmark name (docking target).

        Args:
            benchmark (str): the name of the benchmark; it is resolved against
                the known dataset names with ``fuzzy_search``.
            num_max_call (int, optional): maximum of oracle calls

        Returns:
            dict: a dictionary of oracle function, molecule library dataset,
            and the name of docking target
        """
        dataset = fuzzy_search(benchmark, self.dataset_names)

        from ..oracles import Oracle
        # NOTE(review): the oracle name is hard-coded and ``num_max_call`` is
        # not forwarded, so every benchmark name yields the same oracle.
        # Kept as-is because the set of valid oracle names is not visible
        # from this file -- confirm the intended per-target oracle.
        oracle = Oracle(name="3pbl_docking")
        data = pd.read_csv(os.path.join(self.path, 'zinc.tab'), sep='\t')
        return {'oracle': oracle, 'data': data, 'name': dataset}

    def evaluate(self, pred, true=None, benchmark=None, m1_api=None, save_dict=True):
        """Compute the docking benchmark metrics for one run.

        Args:
            pred (dict): a nested dictionary, where the first level key is the
                docking target, the value is another dictionary where the key
                is the maximum oracle calls, and value can have two options.
                One, a dictionary of SMILES paired up with the docking scores
                and Second, a list of SMILES strings, where the function will
                generate the docking scores automatically.
            true (optional): unused; kept for interface compatibility.
            benchmark (str, optional): name of the benchmark docking target.
                Currently unused; targets are taken from the keys of ``pred``.
            m1_api (str, optional): API token of Molecule.One. Currently
                unused; the Molecule.One synthesis score is not computed.
            save_dict (bool, optional): whether or not to save the per-molecule
                results (docking scores, SA scores, filter pass list).

        Returns:
            dict: ``{target: {num_max_call: metrics}}`` where ``metrics`` holds
            ``top100``, ``top10``, ``top1``, ``sa``, ``%pass``, ``top1_%pass``,
            ``diversity``, ``novelty`` and ``top smiles``.

        Raises:
            ValueError: if an entry is neither a dict nor a list, or does not
                contain exactly 100 molecules.
        """
        from ..oracles import Oracle
        from ..chem_utils.oracle.filter import MolFilter

        results_all = {}
        training_smiles = None  # zinc.tab is loop-invariant: read once, lazily

        for data_name, pred_all in pred.items():
            results_max_call = {}
            for num_max_call, pred_ in pred_all.items():
                results = {}
                recalc = False
                if isinstance(pred_, dict):
                    print_sys("The input is a dictionary, expected to have SMILES string as key and docking score as value!")
                    docking_scores = pred_
                    pred_ = list(pred_.keys())
                elif isinstance(pred_, list):
                    recalc = True
                    print_sys("The input is a list, docking score will be computed! If you already have the docking scores, please make the list as a dictionary with SMILES string as key and docking score as value")
                else:
                    raise ValueError("The input prediction must be a dictionary with SMILES and their docking scores or a list of SMILES!")

                # From here on pred_ is a list of SMILES strings.
                if len(pred_) != 100:
                    raise ValueError("The expected output is a list/dictionary of top 100 molecules!")

                if recalc:
                    dataset = fuzzy_search(data_name, self.dataset_names)
                    target_pdb_file = os.path.join(self.path, dataset + '.pdb')
                    data_path = os.path.join(self.path, dataset)
                    oracle = Oracle(name="Docking_Score",
                                    software="vina",
                                    pyscreener_path=self.pyscreener_path,
                                    receptors=[target_pdb_file],
                                    center=docking_target_info[dataset]['center'],
                                    size=docking_target_info[dataset]['size'],
                                    buffer=10,
                                    path=data_path,
                                    num_worker=self.num_workers,
                                    ncpu=self.num_cpus,
                                    num_max_call=10000)
                    # NOTE(review): the code below needs a SMILES -> score
                    # mapping; presumably the oracle returns one for a list
                    # input -- confirm.
                    docking_scores = oracle(pred_)

                print_sys("---- Calculating average docking scores ----")
                values = np.array(list(docking_scores.values()))
                # If most scores are positive the caller used the opposite sign
                # convention, so negate them all. The threshold applies to the
                # FRACTION of positive scores, not to their count.
                if values.size and np.mean(values > 0) > 0.7:
                    docking_scores = {j: -k for j, k in docking_scores.items()}
                    values = np.array(list(docking_scores.values()))

                if save_dict:
                    results['docking_scores_dict'] = docking_scores
                results['top100'] = np.mean(values)
                results['top10'] = np.mean(np.sort(values)[:10])
                results['top1'] = values.min()

                print_sys("---- Calculating synthetic accessibility score ----")
                sa = Oracle(name='SA')
                scores_array = sa(pred_)
                if save_dict:
                    results['sa_dict'] = scores_array
                results['sa'] = np.mean(scores_array)

                print_sys("---- Calculating molecular filters scores ----")
                # Filter set follows guacamol.
                filters = MolFilter(filters=['PAINS', 'SureChEMBL', 'Glaxo'],
                                    property_filters_flag=False)
                pred_filter = filters(pred_)
                if save_dict:
                    results['pass_list'] = pred_filter
                results['%pass'] = float(len(pred_filter)) / 100
                # No molecule may survive the filters; record NaN instead of
                # crashing on min() of an empty sequence.
                passed_scores = [docking_scores[i] for i in pred_filter]
                results['top1_%pass'] = min(passed_scores) if passed_scores else float('nan')

                print_sys("---- Calculating diversity ----")
                evaluator = Evaluator(name='Diversity')
                results['diversity'] = evaluator(pred_)

                print_sys("---- Calculating novelty ----")
                evaluator = Evaluator(name='Novelty')
                if training_smiles is None:
                    training = pd.read_csv(os.path.join(self.path, 'zinc.tab'), sep='\t')
                    training_smiles = training.smiles.values
                results['novelty'] = evaluator(pred_, training_smiles)

                # SMILES ordered from best (lowest) to worst docking score.
                results['top smiles'] = [i[0] for i in sorted(docking_scores.items(), key=lambda x: x[1])]
                results_max_call[num_max_call] = results
            results_all[data_name] = results_max_call
        return results_all

    def evaluate_many(self, preds, save_file_name=None, m1_api=None, results_individual=None):
        """Evaluate many runs together and output submission ready pkl file.

        Args:
            preds (list): a list of pred across runs, where each follows the
                format of pred in 'evaluate' function.
            save_file_name (str, optional): the name of the file to save the
                result (``.pkl`` is appended). Defaults to
                ``'tdc_docking_result'``.
            m1_api (str, optional): m1 API token for molecule synthesis score.
            results_individual (list, optional): if you already have generated
                the result from the evaluate function for each run, simply put
                in a list and it will not regenerate the results.

        Returns:
            dict: the output result file. For every target and oracle-call
            budget each numeric metric maps to ``[mean, std]`` across runs
            (rounded to 3 decimals) and ``'top smiles'`` maps to the sorted
            unique SMILES across runs.

        Raises:
            ValueError: if fewer than 3 runs are provided.
        """
        min_requirement = 3
        # The runs that are actually aggregated decide the run count.
        runs_in = preds if results_individual is None else results_individual
        if len(runs_in) < min_requirement:
            raise ValueError("Must have predictions from at least " + str(min_requirement) + " runs for leaderboard submission")

        if results_individual is None:
            individual_results = [self.evaluate(pred, m1_api=m1_api) for pred in preds]
        else:
            individual_results = results_individual

        metrics = ['top100', 'top10', 'top1', 'diversity', 'novelty',
                   '%pass', 'top1_%pass', 'sa', 'top smiles']

        results_agg = {}
        # The first run defines which targets / call budgets are aggregated.
        for target, per_call in individual_results[0].items():
            results_agg_target = {}
            for num_calls in per_call:
                runs = [res[target][num_calls] for res in individual_results]
                results_agg_target_call = {}
                for metric in metrics:
                    if metric == 'top smiles':
                        # Sorted unique SMILES across all runs.
                        results_agg_target_call[metric] = sorted(
                            {smi for run in runs for smi in run[metric]})
                    else:
                        res = [run[metric] for run in runs]
                        results_agg_target_call[metric] = [round(np.mean(res), 3),
                                                           round(np.std(res), 3)]
                results_agg_target[num_calls] = results_agg_target_call
            results_agg[target] = results_agg_target

        import pickle
        if save_file_name is None:
            save_file_name = 'tdc_docking_result'
        with open(save_file_name + '.pkl', 'wb') as f:
            pickle.dump(results_agg, f)
        return results_agg