Source code for tdc.chem_utils.evaluator

# -*- coding: utf-8 -*-
# Author: TDC Team
# License: MIT

import numpy as np
import os

try:
    from rdkit import Chem, DataStructs
    from rdkit.Chem import AllChem
    from rdkit import rdBase

    rdBase.DisableLog("rdApp.error")
except:
    raise ImportError("Please install rdkit by 'conda install -c conda-forge rdkit'! ")


[docs]def single_molecule_validity(smiles): """Evaluate the chemical validity of a single molecule in terms of SMILES string Args: smiles: str, SMILES string. Returns: Boolean: if the SMILES string is a valid molecule """ if smiles.strip() == "": return False mol = Chem.MolFromSmiles(smiles) if mol is None or mol.GetNumAtoms() == 0: return False return True
[docs]def validity(list_of_smiles): valid_list_smiles = list(filter(single_molecule_validity, list_of_smiles)) return 1.0 * len(valid_list_smiles) / len(list_of_smiles)
[docs]def canonicalize(smiles): """Convert SMILES into canonical form. Args: smiles: str, SMILES string Returns: smiles: str, canonical SMILES string. """ mol = Chem.MolFromSmiles(smiles) if mol is not None: return Chem.MolToSmiles(mol, isomericSmiles=True) else: return None
[docs]def unique_lst_of_smiles(list_of_smiles): canonical_smiles_lst = list(map(canonicalize, list_of_smiles)) canonical_smiles_lst = list(filter(lambda x: x is not None, canonical_smiles_lst)) canonical_smiles_lst = list(set(canonical_smiles_lst)) return canonical_smiles_lst
[docs]def uniqueness(list_of_smiles): """Evaluate the uniqueness of a list of SMILES string, i.e., the fraction of unique molecules among a given list. Args: list_of_smiles: list (of SMILES string) Returns: uniqueness: float """ canonical_smiles_lst = unique_lst_of_smiles(list_of_smiles) return 1.0 * len(canonical_smiles_lst) / len(list_of_smiles)
[docs]def novelty(generated_smiles_lst, training_smiles_lst): """Evaluate the novelty of set of generated smiles using list of training smiles as reference. Novelty is defined as the fraction of generated molecules that doesn't appear in the training set. Args: generated_smiles_lst: list (of SMILES string), which are generated. training_smiles_lst: list (of SMILES string), which are used for training. Returns: novelty: float """ generated_smiles_lst = unique_lst_of_smiles(generated_smiles_lst) training_smiles_lst = unique_lst_of_smiles(training_smiles_lst) novel_ratio = ( sum([1 if i in training_smiles_lst else 0 for i in generated_smiles_lst]) * 1.0 / len(generated_smiles_lst) ) return 1 - novel_ratio
[docs]def diversity(list_of_smiles): """Evaluate the internal diversity of a set of molecules. The internbal diversity is defined as the average pairwise Tanimoto distance between the Morgan fingerprints. Args: list_of_smiles: list of SMILES strings Returns: div: float """ list_of_unique_smiles = unique_lst_of_smiles(list_of_smiles) list_of_mol = [Chem.MolFromSmiles(smiles) for smiles in list_of_unique_smiles] list_of_fp = [ AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048, useChirality=False) for mol in list_of_mol ] avg_lst = [] for idx, fp in enumerate(list_of_fp): for fp2 in list_of_fp[idx + 1 :]: sim = DataStructs.TanimotoSimilarity(fp, fp2) ### option I distance = 1 - sim ### option II # distance = -np.log2(sim) avg_lst.append(distance) div = np.mean(avg_lst) return div
######################################## ######## KL divergence ######## def _calculate_pc_descriptors(smiles, pc_descriptors): """Calculate Physical Chemical descriptors of a single SMILES (internal function). Args: list_of_smiles: SMILES strings pc_descriptors: list of strings, names of descriptors to calculate Returns: descriptros: list of float """ from rdkit.ML.Descriptors import MoleculeDescriptors calc = MoleculeDescriptors.MolecularDescriptorCalculator(pc_descriptors) mol = Chem.MolFromSmiles(smiles) if mol is None: return None _fp = calc.CalcDescriptors(mol) _fp = np.array(_fp) mask = np.isfinite(_fp) if (mask == 0).sum() > 0: logger.warning(f"{smiles} contains an NAN physchem descriptor") _fp[~mask] = 0 return _fp
[docs]def calculate_pc_descriptors(smiles, pc_descriptors): """Calculate Physical Chemical descriptors of a list of molecules. Args: list_of_smiles: list of SMILES strings pc_descriptors: list of strings, names of descriptors to calculate Returns: descriptros: list of float """ output = [] for i in smiles: d = _calculate_pc_descriptors(i, pc_descriptors) if d is not None: output.append(d) return np.array(output)
[docs]def continuous_kldiv(X_baseline: np.array, X_sampled: np.array) -> float: """calculate KL divergence for two numpy arrays, conitnuous version. Args: X_baseline: numpy array X_sampled: numpy array Returns: KL divergence: float """ X_baseline += 1e-5 X_sampled += 1e-5 from scipy.stats import entropy, gaussian_kde kde_P = gaussian_kde(X_baseline) kde_Q = gaussian_kde(X_sampled) x_eval = np.linspace( np.hstack([X_baseline, X_sampled]).min(), np.hstack([X_baseline, X_sampled]).max(), num=1000, ) P = kde_P(x_eval) + 1e-10 Q = kde_Q(x_eval) + 1e-10 return entropy(P, Q)
[docs]def discrete_kldiv(X_baseline: np.array, X_sampled: np.array) -> float: """calculate KL divergence for two numpy arrays, discrete version. Args: X_baseline: numpy array X_sampled: numpy array Returns: KL divergence: float """ from scipy.stats import entropy from scipy import histogram P, bins = histogram(X_baseline, bins=10, density=True) P += 1e-10 Q, _ = histogram(X_sampled, bins=bins, density=True) Q += 1e-10 return entropy(P, Q)
[docs]def get_fingerprints(mols, radius=2, length=4096): """ Converts molecules to ECFP bitvectors. Args: mols: RDKit molecules radius: ECFP fingerprint radius length: number of bits Returns: a list of fingerprints """ return [AllChem.GetMorganFingerprintAsBitVect(m, radius, length) for m in mols]
[docs]def get_mols(smiles_list): """Convert SMILES strings to RDKit RDMol objects. Args: list_of_smiles: list of SMILES strings Returns: mols: list of RDKit RDMol objects """ for i in smiles_list: try: mol = Chem.MolFromSmiles(i) if mol is not None: yield mol except Exception as e: logger.warning(e)
[docs]def calculate_internal_pairwise_similarities(smiles_list): """ Computes the pairwise similarities of the provided list of smiles against itself. Args: smiles_list: list of str Returns: Symmetric matrix of pairwise similarities. Diagonal is set to zero. """ if len(smiles_list) > 10000: logger.warning( f"Calculating internal similarity on large set of " f"SMILES strings ({len(smiles_list)})" ) mols = get_mols(smiles_list) fps = get_fingerprints(mols) nfps = len(fps) similarities = np.zeros((nfps, nfps)) for i in range(1, nfps): sims = DataStructs.BulkTanimotoSimilarity(fps[i], fps[:i]) similarities[i, :i] = sims similarities[:i, i] = sims return similarities
[docs]def kl_divergence(generated_smiles_lst, training_smiles_lst): """Evaluate the KL divergence of set of generated smiles using list of training smiles as reference. KL divergence is defined as the averaged KL divergence of a set of physical chemical descriptors between a set of generated molecules and a set of training molecules. Args: generated_smiles_lst: list (of SMILES string), which are generated. training_smiles_lst: list (of SMILES string), which are used for training. Returns: KL divergence: float """ pc_descriptor_subset = [ "BertzCT", "MolLogP", "MolWt", "TPSA", "NumHAcceptors", "NumHDonors", "NumRotatableBonds", "NumAliphaticRings", "NumAromaticRings", ] def canonical(smiles): mol = Chem.MolFromSmiles(smiles) if mol is not None: return Chem.MolToSmiles(mol, isomericSmiles=True) ### todo double check else: return None generated_lst_mol = list(map(canonical, generated_smiles_lst)) training_lst_mol = list(map(canonical, training_smiles_lst)) filter_out_func = lambda x: x is not None generated_lst_mol = list(filter(filter_out_func, generated_lst_mol)) training_lst_mol = list(filter(filter_out_func, training_lst_mol)) d_sampled = calculate_pc_descriptors(generated_lst_mol, pc_descriptor_subset) d_chembl = calculate_pc_descriptors(training_lst_mol, pc_descriptor_subset) kldivs = {} for i in range(4): kldiv = continuous_kldiv(X_baseline=d_chembl[:, i], X_sampled=d_sampled[:, i]) kldivs[pc_descriptor_subset[i]] = kldiv # ... and for the int valued ones. for i in range(4, 9): kldiv = discrete_kldiv(X_baseline=d_chembl[:, i], X_sampled=d_sampled[:, i]) kldivs[pc_descriptor_subset[i]] = kldiv # pairwise similarity chembl_sim = calculate_internal_pairwise_similarities(training_lst_mol) chembl_sim = chembl_sim.max(axis=1) sampled_sim = calculate_internal_pairwise_similarities(generated_lst_mol) sampled_sim = sampled_sim.max(axis=1) kldiv_int_int = continuous_kldiv(X_baseline=chembl_sim, X_sampled=sampled_sim) kldivs["internal_similarity"] = kldiv_int_int """ # for some reason, this runs into problems when both sets are identical. # cross_set_sim = calculate_pairwise_similarities(self.training_set_molecules, unique_molecules) # cross_set_sim = cross_set_sim.max(axis=1) # # kldiv_ext = discrete_kldiv(chembl_sim, cross_set_sim) # kldivs['external_similarity'] = kldiv_ext # kldiv_sum += kldiv_ext """ # Each KL divergence value is transformed to be in [0, 1]. # Then their average delivers the final score. partial_scores = [np.exp(-score) for score in kldivs.values()] score = sum(partial_scores) / len(partial_scores) return score
[docs]def fcd_distance_tf(generated_smiles_lst, training_smiles_lst): """Evaluate FCD distance between generated smiles set and training smiles set using tensorflow. Args: generated_smiles_lst: list (of SMILES string), which are generated. training_smiles_lst: list (of SMILES string), which are used for training. Returns: fcd_distance: float """ import pkgutil, tempfile, os if "chemnet" not in globals().keys(): global chemnet ### _load_chemnet chemnet_model_filename = "ChemNet_v0.13_pretrained.h5" model_bytes = pkgutil.get_data("fcd", chemnet_model_filename) tmpdir = tempfile.gettempdir() model_path = os.path.join(tmpdir, chemnet_model_filename) with open(model_path, "wb") as f: f.write(model_bytes) chemnet = fcd.load_ref_model(model_path) # _load_chemnet def _calculate_distribution_statistics(chemnet, molecules): sample_std = fcd.canonical_smiles(molecules) gen_mol_act = fcd.get_predictions(chemnet, sample_std) mu = np.mean(gen_mol_act, axis=0) cov = np.cov(gen_mol_act.T) return mu, cov mu_ref, cov_ref = _calculate_distribution_statistics(chemnet, training_smiles_lst) mu, cov = _calculate_distribution_statistics(chemnet, generated_smiles_lst) FCD = fcd.calculate_frechet_distance(mu1=mu_ref, mu2=mu, sigma1=cov_ref, sigma2=cov) fcd_distance = np.exp(-0.2 * FCD) return fcd_distance
[docs]def fcd_distance_torch(generated_smiles_lst, training_smiles_lst): """Evaluate FCD distance between generated smiles set and training smiles set using PyTorch. Args: generated_smiles_lst: list (of SMILES string), which are generated. training_smiles_lst: list (of SMILES string), which are used for training. Returns: fcd_distance: float """ os.environ["KMP_DUPLICATE_LIB_OK"] = "True" from fcd_torch import FCD fcd = FCD(device="cpu", n_jobs=8) fcd_distance = fcd(generated_smiles_lst, training_smiles_lst) return fcd_distance
[docs]def fcd_distance(generated_smiles_lst, training_smiles_lst): """Evaluate FCD distance between generated smiles set and training smiles set. Args: generated_smiles_lst: list (of SMILES string), which are generated. training_smiles_lst: list (of SMILES string), which are used for training. Returns: fcd_distance: float """ try: import tensorflow, fcd global fcd except: try: import torch, fcd_torch return fcd_distance_torch(generated_smiles_lst, training_smiles_lst) except: raise ImportError( "Please install fcd by 'pip install FCD' (for Tensorflow backend) \ or 'pip install fcd_torch' (for PyTorch backend)!" ) return fcd_distance_tf(generated_smiles_lst, training_smiles_lst)