# Source code for the TDC MolFilter module (extracted from the rendered documentation page).

# -*- coding: utf-8 -*-
# Author: TDC Team
# License: MIT

import numpy as np

# rdkit is a hard dependency of this module: fail at import time with
# installation instructions instead of a bare ModuleNotFoundError later on.
try:
    from rdkit import rdBase
except ImportError as err:
    raise ImportError(
        "Please install rdkit by 'conda install -c conda-forge rdkit'! "
    ) from err

from ...utils import print_sys, install

class MolFilter:
    """Molecule filter: keep only the molecules that satisfy user-specified conditions.

    Each input SMILES is evaluated with ``rd_filters.RDFilters`` against the
    selected structural-alert rule sets and, optionally, against inclusive
    ``[lower_bound, upper_bound]`` windows on six computed properties.

    Args:
        filters: ``"all"``, a single rule-set name, or an iterable of names
            taken from 'BMS', 'Dundee', 'Glaxo', 'Inpharmatica', 'LINT',
            'MLSMR', 'PAINS', 'SureChEMBL'.
        property_filters_flag: bool. When true, the property windows below
            are applied in addition to the structural alerts; when false,
            the property keys are removed from the rule dictionary.
        HBA: [lower_bound, upper_bound]
        HBD: [lower_bound, upper_bound]
        LogP: [lower_bound, upper_bound]
        MW: [lower_bound, upper_bound], Molecule weight
        Rot: [lower_bound, upper_bound]
        TPSA: [lower_bound, upper_bound]

    Returns:
        Calling the instance returns a numpy array holding the SMILES
        strings that pass the filter.

    Raises:
        ValueError: if a filter name is unknown, or if the call input is not
            a SMILES string, a list, or a numpy array.
    """

    # MIT License: see the rd_filters project, which provides the alert rules.

    # Names of the structural-alert rule sets that can be switched on.
    _ALL_FILTERS = (
        "BMS",
        "Dundee",
        "Glaxo",
        "Inpharmatica",
        "LINT",
        "MLSMR",
        "PAINS",
        "SureChEMBL",
    )
    # Property keys stored in ``rule_dict`` when property filtering is on.
    _PROPERTY_KEYS = ("HBA", "HBD", "LogP", "MW", "Rot", "TPSA")
    # Column order used to label the rows produced by ``self.rf.evaluate``.
    _RESULT_COLUMNS = (
        "SMILES",
        "NAME",
        "FILTER",
        "MW",
        "LogP",
        "HBD",
        "HBA",
        "TPSA",
        "Rot",
    )

    def __init__(
        self,
        filters="all",
        property_filters_flag=True,
        HBA=[0, 10],
        HBD=[0, 5],
        LogP=[-5, 5],
        MW=[0, 500],
        Rot=[0, 10],
        TPSA=[0, 200],
    ):
        try:
            from rd_filters.rd_filters import RDFilters, read_rules
        except ImportError:
            # NOTE(review): the install target was truncated to "git+" in the
            # reviewed source; restored to the rd_filters GitHub repository.
            # Confirm the URL before merging.
            install("git+https://github.com/PatWalters/rd_filters.git")
            from rd_filters.rd_filters import RDFilters, read_rules
        import pkg_resources

        self.property_filters_flag = property_filters_flag
        filters = self._resolve_filters(filters)

        alert_file_name = pkg_resources.resource_filename(
            "rd_filters", "data/alert_collection.csv"
        )
        rules_file_path = pkg_resources.resource_filename(
            "rd_filters", "data/rules.json"
        )
        self.rf = RDFilters(alert_file_name)
        self.rule_dict = read_rules(rules_file_path)

        # Switch Inpharmatica off first so it is only active when requested.
        self.rule_dict["Rule_Inpharmatica"] = False
        for name in filters:
            self.rule_dict["Rule_" + name] = True

        if self.property_filters_flag:
            bounds = {
                "HBA": HBA,
                "HBD": HBD,
                "LogP": LogP,
                "MW": MW,
                "Rot": Rot,
                "TPSA": TPSA,
            }
            for key, value in bounds.items():
                # Copy so the shared (mutable) default lists are never
                # aliased by, or mutated through, ``rule_dict``.
                self.rule_dict[key] = list(value)
        else:
            for key in self._PROPERTY_KEYS:
                self.rule_dict.pop(key, None)

        print_sys("MolFilter is using the following filters:")
        for key, value in self.rule_dict.items():
            # NOTE(review): the flattened source is ambiguous about which
            # ``if`` the ``else`` belonged to; this prints enabled rule sets
            # and every non-rule entry (the property bounds) - confirm.
            if value or not key.startswith("Rule"):
                print_sys(key + ": " + str(value))

        rule_list = [
            key.replace("Rule_", "")
            for key, enabled in self.rule_dict.items()
            if key.startswith("Rule") and enabled
        ]
        self.rf.build_rule_list(rule_list)

    @classmethod
    def _resolve_filters(cls, filters):
        """Expand ``filters`` into a validated list of rule-set names."""
        if isinstance(filters, str):
            if filters == "all":
                return list(cls._ALL_FILTERS)
            names = [filters]
        else:
            names = list(filters)
        for name in names:
            if name not in cls._ALL_FILTERS:
                raise ValueError(
                    f"{name} not found; Please choose from a list of available filters from 'BMS', 'Dundee', 'Glaxo', 'Inpharmatica', 'LINT', 'MLSMR', 'PAINS', 'SureChEMBL'"
                )
        return names

    @staticmethod
    def _index_smiles(input_data):
        """Return ``[(smiles, position), ...]`` for one SMILES or a sequence of them."""
        if isinstance(input_data, str):
            input_data = [input_data]
        elif not isinstance(input_data, (list, np.ndarray, np.generic)):
            raise ValueError(
                "Input must be a list/numpy array of SMILES or one SMILES string!"
            )
        return [(smiles, idx) for idx, smiles in enumerate(input_data)]

    def __call__(self, input_data):
        """Evaluate the input molecules and return those that pass.

        Args:
            input_data: one SMILES string, or a list / numpy array of SMILES.

        Returns:
            numpy array of the SMILES whose FILTER value is "OK" and, when
            ``property_filters_flag`` is set, whose properties fall inside
            the configured bounds.

        Raises:
            ValueError: if ``input_data`` is not a string, list or numpy array.
        """
        pairs = self._index_smiles(input_data)
        if not pairs:
            # Nothing to evaluate: skip spawning worker processes.
            return np.array([], dtype=object)

        import multiprocessing as mp

        import pandas as pd

        # Never start more workers than there are molecules to evaluate.
        num_workers = max(1, min(int(mp.cpu_count()), len(pairs)))
        # The context manager tears the pool down even if evaluation raises.
        with mp.Pool(num_workers) as pool:
            # NOTE(review): this call was garbled in the reviewed source
            # ("list(, input_data)"). Each (smiles, position) pair is passed
            # as a single argument to ``evaluate`` - confirm against
            # rd_filters.
            res = pool.map(self.rf.evaluate, pairs)

        df = pd.DataFrame(res, columns=list(self._RESULT_COLUMNS))
        passed = df.FILTER == "OK"
        if self.property_filters_flag:
            for key in self._PROPERTY_KEYS:
                passed = passed & df[key].between(*self.rule_dict[key])
        return df[passed].SMILES.values