Source code for tdc.chem_utils.featurize.molconvert

# -*- coding: utf-8 -*-
# Author: TDC Team
# License: MIT

import numpy as np 
from typing import List


try: 
  from rdkit import Chem, DataStructs
  from rdkit.Chem import AllChem
  from rdkit import rdBase
  rdBase.DisableLog('rdApp.error')
  from rdkit.Chem.Fingerprints import FingerprintMols
  from rdkit.Chem import MACCSkeys
except:
  raise ImportError("Please install rdkit by 'conda install -c conda-forge rdkit'! ")	


from ...utils import print_sys
from ..oracle.oracle import smiles_to_rdkit_mol, smiles_2_fingerprint_ECFP4, smiles_2_fingerprint_FCFP4, smiles_2_fingerprint_AP, smiles_2_fingerprint_ECFP6
from ._smiles2pubchem import smiles2pubchem

def canonicalize(smiles):
    """Return the canonical isomeric SMILES for ``smiles``.

    Args:
        smiles: str, a SMILES string.

    Returns:
        str written back by RDKit with ``isomericSmiles=True``, or None when
        RDKit cannot parse the input.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if parsed is None:
        return None
    return Chem.MolToSmiles(parsed, isomericSmiles=True)
def smiles2morgan(s, radius = 2, nBits = 1024):
    """Convert smiles into Morgan Fingerprint.

    Args:
        s: str, a SMILES string.
        radius: int (default: 2)
        nBits: int (default: 1024)

    Returns:
        fp: numpy.array. When the fingerprint cannot be computed, a message is
        printed through ``print_sys`` and an all-zero vector of length
        ``nBits`` is returned instead.
    """
    try:
        canonical = canonicalize(s)
        mol = Chem.MolFromSmiles(canonical)
        features_vec = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nBits)
        # Placeholder target; RDKit is expected to resize it while copying the
        # bits -- TODO confirm against the installed RDKit version.
        features = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(features_vec, features)
    except Exception:
        # Report the caller's input through str(): canonicalize() returns None
        # for unparsable SMILES, and concatenating None here used to raise a
        # TypeError that defeated the all-zero fallback.
        print_sys('rdkit not found this smiles for morgan: ' + str(s) + ' convert to all 0 features')
        features = np.zeros((nBits, ))
    return features
def smiles2rdkit2d(s):
    """Convert smiles into 200-dim Normalized RDKit 2D vector.

    Args:
        s: str, a SMILES string.

    Returns:
        fp: numpy.array. NaN entries are replaced by 0. When descriptor
        generation fails, a message is printed through ``print_sys`` and an
        all-zero vector of length 200 is returned.

    Raises:
        ImportError: if descriptastorus cannot be imported.
    """
    canonical = canonicalize(s)
    try:
        from descriptastorus.descriptors import rdNormalizedDescriptors
    except Exception as err:
        raise ImportError("Please install pip install git+https://github.com/bp-kelley/descriptastorus and pip install pandas-flavor") from err
    try:
        generator = rdNormalizedDescriptors.RDKit2DNormalized()
        # The first element of the generator output is dropped, as in the
        # original code (presumably a success flag -- TODO confirm).
        features = np.array(generator.process(canonical)[1:])
        features[np.isnan(features)] = 0
    except Exception:
        # str(s) keeps the fallback alive when canonicalize() returned None;
        # concatenating None used to raise TypeError inside this handler.
        print_sys('descriptastorus not found this smiles: ' + str(s) + ' convert to all 0 features')
        features = np.zeros((200, ))
    return np.array(features)
def smiles2daylight(s):
    """Convert smiles into 2048-dim Daylight feature.

    Args:
        s: str, a SMILES string.

    Returns:
        fp: numpy.array of length 2048 with 1 at every on-bit. When the
        fingerprint cannot be computed, a message is printed through
        ``print_sys`` and an all-zero vector is returned.
    """
    NumFinger = 2048
    try:
        mol = Chem.MolFromSmiles(canonicalize(s))
        bv = FingerprintMols.FingerprintMol(mol)
        # Force an integer index array: an empty tuple would otherwise become
        # a float array, which numpy rejects as an index.
        on_bits = np.array(tuple(bv.GetOnBits()), dtype=int)
        features = np.zeros((NumFinger, ))
        features[on_bits] = 1
    except Exception:
        # str(s) keeps the fallback alive when canonicalize() returned None;
        # concatenating None used to raise TypeError inside this handler.
        print_sys('rdkit not found this smiles: ' + str(s) + ' convert to all 0 features')
        features = np.zeros((NumFinger, ))
    return np.array(features)
def smiles2maccs(s):
    """Convert smiles into maccs feature.

    Args:
        s: str, a SMILES string.

    Returns:
        fp: numpy.array (float64) filled from the RDKit MACCS keys.
    """
    mol = Chem.MolFromSmiles(canonicalize(s))
    maccs_keys = MACCSkeys.GenMACCSKeys(mol)
    out = np.zeros((0,), dtype=np.float64)
    DataStructs.ConvertToNumpyArray(maccs_keys, out)
    return out
''' ECFP2 ---- 1 ECFP4 ---- 2 ECFP6 ---- 3 xxxxxxxxx ------ https://github.com/rdkit/benchmarking_platform/blob/master/scoring/fingerprint_lib.py '''
def smiles2ECFP2(smiles):
    """Convert smiles into ECFP2 Morgan Fingerprint (Morgan radius 1).

    Args:
        smiles: str, a SMILES string.

    Returns:
        fp: numpy.array (float64) holding the fingerprint bits; 2048 bits
        are requested from RDKit.
    """
    n_bits = 2048
    mol = smiles_to_rdkit_mol(canonicalize(smiles))
    bit_vect = AllChem.GetMorganFingerprintAsBitVect(mol, 1, nBits=n_bits)
    out = np.zeros((0,), dtype=np.float64)
    DataStructs.ConvertToNumpyArray(bit_vect, out)
    return out
def smiles2ECFP4(smiles):
    """Convert smiles into ECFP4 Morgan Fingerprint (Morgan radius 2).

    Args:
        smiles: str, a SMILES string.

    Returns:
        fp: numpy.array (float64) holding the fingerprint bits; 2048 bits
        are requested from RDKit.
    """
    n_bits = 2048
    mol = smiles_to_rdkit_mol(canonicalize(smiles))
    bit_vect = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=n_bits)
    out = np.zeros((0,), dtype=np.float64)
    DataStructs.ConvertToNumpyArray(bit_vect, out)
    return out
def smiles2ECFP6(smiles):
    """Convert smiles into ECFP6 Morgan Fingerprint (Morgan radius 3).

    Args:
        smiles: str, a SMILES string.

    Returns:
        fp: numpy.array (float64) holding the fingerprint bits; 2048 bits
        are requested from RDKit.

    refer: https://github.com/rdkit/benchmarking_platform/blob/master/scoring/fingerprint_lib.py
    """
    n_bits = 2048
    mol = smiles_to_rdkit_mol(canonicalize(smiles))
    bit_vect = AllChem.GetMorganFingerprintAsBitVect(mol, 3, nBits=n_bits)
    out = np.zeros((0,), dtype=np.float64)
    DataStructs.ConvertToNumpyArray(bit_vect, out)
    return out
# def smiles2smart(smiles):
class MoleculeFingerprint:
    r'''Callable that turns SMILES strings into a chosen fingerprint.

    Example:
        MolFP = MoleculeFingerprint(fp = 'ECFP6')
        out = MolFP('Clc1ccccc1C2C(=C(/N/C(=C2/C(=O)OCC)COCCN)C)\C(=O)OC')
        # np.array([1, 0, 1, .....])
        out = MolFP(['Clc1ccccc1C2C(=C(/N/C(=C2/C(=O)OCC)COCCN)C)\C(=O)OC',
                     'CCCOc1cc2ncnc(Nc3ccc4ncsc4c3)c2cc1S(=O)(=O)C(C)(C)C'])
        # np.array([[1, 0, 1, .....],
        #           [0, 0, 1, .....]])

    Supporting FPs:
        ECFP2, ECFP4, ECFP6, MACCS, Daylight, RDKit2D, Morgan, PubChem
    '''

    def __init__(self, fp = 'ECFP4'):
        """Select the featurizer.

        Args:
            fp: str, one of 'ECFP2', 'ECFP4', 'ECFP6', 'MACCS', 'Daylight',
                'RDKit2D', 'Morgan', 'PubChem'.

        Raises:
            Exception: if ``fp`` is not a supported fingerprint name.
        """
        fp2func = {'ECFP2': smiles2ECFP2,
                   'ECFP4': smiles2ECFP4,
                   'ECFP6': smiles2ECFP6,
                   'MACCS': smiles2maccs,
                   'Daylight': smiles2daylight,
                   'RDKit2D': smiles2rdkit2d,
                   'Morgan': smiles2morgan,
                   'PubChem': smiles2pubchem}
        # Explicit check instead of ``assert``: asserts are stripped under
        # ``python -O``, which would turn this into a bare KeyError below.
        if fp not in fp2func:
            raise Exception("The fingerprint you specify are not supported. "
                            "It can only among 'ECFP2', 'ECFP4', 'ECFP6', 'MACCS', 'Daylight', 'RDKit2D', 'Morgan', 'PubChem'")
        self.fp = fp
        self.func = fp2func[fp]

    def __call__(self, x):
        """Featurize one SMILES string or a list of them.

        Args:
            x: str or list of str.

        Returns:
            The featurizer output for a str; the outputs stacked row-wise
            with ``np.vstack`` for a list; None for any other input type
            (kept for backward compatibility).
        """
        if isinstance(x, str):
            return self.func(x)
        if isinstance(x, list):
            return np.vstack([self.func(item) for item in x])
        return None
def smiles2selfies(smiles):
    """Convert smiles into selfies.

    Args:
        smiles: str, a SMILES string

    Returns:
        selfies: str, a SELFIES string.

    Raises:
        ImportError: if the ``selfies`` package is not installed.
    """
    # Import locally so the function also works when called directly; the
    # module-level name ``sf`` only exists after MolConvert was constructed
    # with a SELFIES source or destination.
    try:
        import selfies as sf
    except ImportError as err:
        raise ImportError("Please install selfies via 'pip install selfies'") from err
    smiles = canonicalize(smiles)
    return sf.encoder(smiles)
def selfies2smiles(selfies):
    """Convert selfies into smiles.

    Args:
        selfies: str, a SELFIES string.

    Returns:
        smiles: str, the canonicalized SMILES string (None if RDKit cannot
        parse the decoded SMILES).

    Raises:
        ImportError: if the ``selfies`` package is not installed.
    """
    # Import locally so the function also works when called directly; the
    # module-level name ``sf`` only exists after MolConvert was constructed
    # with a SELFIES source or destination.
    try:
        import selfies as sf
    except ImportError as err:
        raise ImportError("Please install selfies via 'pip install selfies'") from err
    return canonicalize(sf.decoder(selfies))
def smiles2mol(smiles):
    """Convert SMILES string into rdkit.Chem.rdchem.Mol.

    Args:
        smiles: str, a SMILES string.

    Returns:
        mol: rdkit.Chem.rdchem.Mol after ``Chem.Kekulize``, or None when the
        SMILES cannot be parsed.
    """
    canonical = canonicalize(smiles)
    # canonicalize() signals an unparsable SMILES with None; handing None to
    # MolFromSmiles raises, so the documented None return was never reached.
    if canonical is None:
        return None
    mol = Chem.MolFromSmiles(canonical)
    if mol is None:
        return None
    Chem.Kekulize(mol)
    return mol
def bondtype2idx(bond_type):
    """Map an RDKit bond type to its integer code.

    Args:
        bond_type: a value compared against ``Chem.rdchem.BondType`` members.

    Returns:
        1 for SINGLE, 2 for DOUBLE, 3 for TRIPLE, 4 for AROMATIC, and None
        for anything else.
    """
    codes = (
        (Chem.rdchem.BondType.SINGLE, 1),
        (Chem.rdchem.BondType.DOUBLE, 2),
        (Chem.rdchem.BondType.TRIPLE, 3),
        (Chem.rdchem.BondType.AROMATIC, 4),
    )
    for kind, code in codes:
        if bond_type == kind:
            return code
    return None
def smiles2graph2D(smiles):
    """Convert SMILES string into a two-dimensional molecular graph.

    Args:
        smiles: str, a SMILES string

    Returns:
        idx2atom: dict, map from atom index to atom symbol,
            e.g., {0:'C', 1:'N', ...}
        adj_matrix: np.array of int, symmetric; entry [i, j] is the code
            returned by ``bondtype2idx`` for the bond between atoms i and j,
            0 where there is no bond.
    """
    mol = smiles2mol(canonicalize(smiles))
    n_atoms = mol.GetNumAtoms()

    idx2atom = {}
    for atom in mol.GetAtoms():
        idx2atom[atom.GetIdx()] = atom.GetSymbol()

    adj_matrix = np.zeros((n_atoms, n_atoms), dtype = int)
    for bond in mol.GetBonds():
        begin = bond.GetBeginAtom().GetIdx()
        end = bond.GetEndAtom().GetIdx()
        code = bondtype2idx(bond.GetBondType())
        adj_matrix[begin, end] = code
        adj_matrix[end, begin] = code
    return idx2atom, adj_matrix
def get_mol(smiles):
    """Parse ``smiles`` with RDKit and kekulize the molecule.

    Args:
        smiles: str, a SMILES string.

    Returns:
        rdkit.Chem.rdchem.Mol after ``Chem.Kekulize``, or None when RDKit
        cannot parse the input.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if parsed is not None:
        Chem.Kekulize(parsed)
        return parsed
    return None
############### PyG begin ############### ELEM_LIST = ['C', 'N', 'O', 'S', 'F', 'Si', 'P', 'Cl', 'Br', 'Mg', 'Na', 'Ca', 'Fe', 'Al', 'I', 'B', 'K', 'Se', 'Zn', 'H', 'Cu', 'Mn', 'unknown'] ATOM_FDIM = len(ELEM_LIST) + 6 + 5 + 4 + 1 BOND_FDIM = 5 + 6 MAX_NB = 6 # https://github.com/kexinhuang12345/DeepPurpose/blob/master/DeepPurpose/chemutils.py
def onek_encoding_unk(x, allowable_set):
    """One-hot encode ``x`` over ``allowable_set`` with an unknown fallback.

    Args:
        x: the value to encode.
        allowable_set: sequence of allowed values; its last element is used
            in place of ``x`` when ``x`` is not in the sequence.

    Returns:
        list of bool, one entry per element of ``allowable_set``.
    """
    target = x if x in allowable_set else allowable_set[-1]
    return [target == candidate for candidate in allowable_set]
def get_atom_features(atom):
    """Build the feature vector of one RDKit atom.

    The vector concatenates one-hot encodings of the element symbol (over
    ``ELEM_LIST``), the degree (0-5), the formal charge (-1, -2, 1, 2, 0)
    and the chiral tag (0-3), followed by the aromaticity flag.

    Args:
        atom: an RDKit atom exposing GetSymbol, GetDegree, GetFormalCharge,
            GetChiralTag and GetIsAromatic.

    Returns:
        torch.Tensor, 1-D.
    """
    # Import locally so the function works when called directly; the
    # module-level name ``torch`` only exists after MolConvert(dst='PyG')
    # was constructed.
    import torch

    return torch.Tensor(onek_encoding_unk(atom.GetSymbol(), ELEM_LIST)
            + onek_encoding_unk(atom.GetDegree(), [0,1,2,3,4,5])
            + onek_encoding_unk(atom.GetFormalCharge(), [-1,-2,1,2,0])
            + onek_encoding_unk(int(atom.GetChiralTag()), [0,1,2,3])
            + [atom.GetIsAromatic()])
def smiles2PyG(smiles):
    """Convert SMILES string into torch_geometric.data.Data.

    Args:
        smiles: str, a SMILES string

    Returns:
        data: torch_geometric.data.Data with ``x`` (stacked per-atom features
        from ``get_atom_features``) and ``edge_index`` (shape
        [2, 2 * number of bonds]; each bond is stored in both directions).

    Raises:
        ImportError: if torch / torch_geometric are not installed.
    """
    # Import locally so the function works when called directly; the
    # module-level names ``torch`` and ``Data`` only exist after
    # MolConvert(dst='PyG') was constructed.
    try:
        import torch
        from torch_geometric.data import Data
    except ImportError as err:
        raise ImportError("Please install PyTorch Geometric via 'https://pytorch-geometric.readthedocs.io/en/latest/notes/installation.html'.") from err

    mol = Chem.MolFromSmiles(canonicalize(smiles))
    atom_features = torch.stack([get_atom_features(atom) for atom in mol.GetAtoms()])

    edges = []
    for bond in mol.GetBonds():
        idx1 = bond.GetBeginAtom().GetIdx()
        idx2 = bond.GetEndAtom().GetIdx()
        edges.extend([[idx1, idx2], [idx2, idx1]])
    # reshape(-1, 2) keeps the result two-dimensional for molecules without
    # bonds (e.g. a single atom): (0, 2) -> (2, 0) instead of shape (0,).
    edge_index = torch.tensor(edges, dtype=torch.long).reshape(-1, 2).t().contiguous()
    return Data(x=atom_features, edge_index=edge_index)
def molfile2PyG(molfile):
    """Convert a mol file into torch_geometric.data.Data.

    Args:
        molfile: str, a file.

    Returns:
        The result of ``smiles2PyG`` on the canonicalized SMILES read from
        the file via ``molfile2smiles``.
    """
    return smiles2PyG(canonicalize(molfile2smiles(molfile)))
############### PyG end ############### ############### DGL begin ###############
def smiles2DGL(smiles):
    """Convert SMILES string into dgl.DGLGraph.

    Args:
        smiles: str, a SMILES string

    Returns:
        g: dgl.DGLGraph with one node per atom and two directed edges per
        bond (one in each direction).

    Raises:
        ImportError: if dgl is not installed.
    """
    # Import locally so the function works when called directly; the
    # module-level name ``dgl`` only exists after MolConvert(dst='DGL')
    # was constructed.
    try:
        import dgl
    except ImportError as err:
        raise ImportError("Please install DGL via 'pip install dgl'.") from err

    mol = Chem.MolFromSmiles(canonicalize(smiles))
    n_atoms = mol.GetNumAtoms()

    # Collect endpoints directly: unpacking zip(*pairs) raised ValueError for
    # molecules without any bond (e.g. a single atom).
    src, dst = [], []
    for bond in mol.GetBonds():
        idx1 = bond.GetBeginAtom().GetIdx()
        idx2 = bond.GetEndAtom().GetIdx()
        src.extend((idx1, idx2))
        dst.extend((idx2, idx1))

    g = dgl.DGLGraph()
    g.add_nodes(n_atoms)
    if src:
        g.add_edges(src, dst)
    return g
############### DGL end ############### from ._xyz2mol import xyzfile2mol
def mol2smiles(mol):
    """Convert an RDKit molecule into a canonical SMILES string.

    Args:
        mol: rdkit.Chem.rdchem.Mol

    Returns:
        smiles: str, the output of ``Chem.MolToSmiles`` passed through
        ``canonicalize``.
    """
    return canonicalize(Chem.MolToSmiles(mol))
def xyzfile2smiles(xyzfile):
    """Convert xyzfile into smiles string.

    Args:
        xyzfile: str, file

    Returns:
        smiles: str, a SMILES string
    """
    # xyzfile2mol returns a pair; only the molecule is needed here.
    molecule = xyzfile2mol(xyzfile)[0]
    return canonicalize(mol2smiles(molecule))
def xyzfile2selfies(xyzfile):
    """Convert xyzfile into SELFIES string.

    Args:
        xyzfile: str, file

    Returns:
        selfies: str, a SELFIES string.
    """
    canonical = canonicalize(xyzfile2smiles(xyzfile))
    return smiles2selfies(canonical)
def distance3d(coordinate_1, coordinate_2):
    """Euclidean distance between two coordinate sequences.

    Args:
        coordinate_1: iterable of numbers.
        coordinate_2: iterable of numbers; pairs are formed with ``zip``, so
            extra trailing entries of the longer input are ignored.

    Returns:
        The square root (``np.sqrt``) of the summed squared differences.
    """
    squared_total = 0
    for c1, c2 in zip(coordinate_1, coordinate_2):
        squared_total = squared_total + (c1 - c2) ** 2
    return np.sqrt(squared_total)
def upper_atom(atomsymbol):
    """Upper-case the first character of an atom symbol.

    Args:
        atomsymbol: str, non-empty; the remaining characters are kept as is.

    Returns:
        str, e.g. 'cl' -> 'Cl'.
    """
    first, rest = atomsymbol[0], atomsymbol[1:]
    return first.upper() + rest
def xyzfile2graph3d(xyzfile):
    """Convert an xyz file into a 3D graph.

    Args:
        xyzfile: str, file

    Returns:
        idx2atom: dict, map from atom index to atom symbol.
        distance_adj_matrix: np.array, symmetric matrix of pairwise distances
            between the atom coordinates.
        BO: the second value returned by ``xyzfile2mol`` (presumably the
            bond-order matrix -- TODO confirm against ``_xyz2mol``).
    """
    # ``read_xyz_file`` and ``str_atom`` were used without being imported.
    # NOTE(review): assumes both helpers live in ``_xyz2mol`` next to
    # ``xyzfile2mol`` -- confirm.
    from ._xyz2mol import read_xyz_file, str_atom

    # The original passed the undefined name ``file`` here.
    atoms, _charge, xyz_coordinates = read_xyz_file(xyzfile)
    num_atoms = len(atoms)
    distance_adj_matrix = np.zeros((num_atoms, num_atoms))
    for i in range(num_atoms):
        for j in range(i + 1, num_atoms):
            distance = distance3d(xyz_coordinates[i], xyz_coordinates[j])
            distance_adj_matrix[i, j] = distance_adj_matrix[j, i] = distance
    idx2atom = {idx: upper_atom(str_atom(atom)) for idx, atom in enumerate(atoms)}
    _mol, BO = xyzfile2mol(xyzfile)
    return idx2atom, distance_adj_matrix, BO
############## end xyz2mol ################
def sdffile2smiles_lst(sdffile):
    """Convert SDF file into a list of SMILES string.

    Args:
        sdffile: str, file

    Returns:
        smiles_lst: a list of SMILES strings, taken from the 'SMILES' column
        that ``LoadSDF`` adds to the loaded frame.
    """
    from rdkit.Chem.PandasTools import LoadSDF

    frame = LoadSDF(sdffile, smilesName='SMILES')
    return frame['SMILES'].to_list()
def sdffile2mol_conformer(sdffile):
    """Convert sdffile into a list of (molecule, conformer) pairs.

    Args:
        sdffile: str, file

    Returns:
        mol_conformer_lst: a list of tuples (mol, conformer), where ``mol``
        comes from the 'ROMol' column of the loaded SDF and ``conformer`` is
        ``mol.GetConformer(id=0)``.
    """
    from rdkit.Chem.PandasTools import LoadSDF

    frame = LoadSDF(sdffile, smilesName='SMILES')
    molecules = frame['ROMol'].tolist()
    conformers = [molecule.GetConformer(id=0) for molecule in molecules]
    return list(zip(molecules, conformers))
def mol_conformer2graph3d(mol_conformer_lst):
    """Convert list of (molecule, conformer) into a list of 3D graph.

    Args:
        mol_conformer_lst: list of tuple (molecule, conformer)

    Returns:
        graph3d_lst: a list of 3D graph.
            each graph has (i) idx2atom (dict);
                           (ii) distance_adj_matrix (np.array), symmetric;
                           (iii) bondtype_adj_matrix (np.array of int),
                                 symmetric, with 1/2/3/4 for
                                 SINGLE/DOUBLE/TRIPLE/AROMATIC bonds.

    Raises:
        KeyError: if a bond type is not one of the four listed above.
    """
    graph3d_lst = []
    bond2num = {'SINGLE': 1, 'DOUBLE':2, 'TRIPLE':3, "AROMATIC":4}
    for mol, conformer in mol_conformer_lst:
        atom_num = mol.GetNumAtoms()
        idx2atom = {i: v.GetSymbol() for i, v in enumerate(mol.GetAtoms())}

        # (atom_num, 3) coordinate table; reshape keeps it two-dimensional
        # when the molecule has no atoms.
        positions = np.array(
            [[pos.x, pos.y, pos.z]
             for pos in (conformer.GetAtomPosition(i) for i in range(atom_num))],
            dtype=float,
        ).reshape(-1, 3)
        # All pairwise Euclidean distances at once.
        deltas = positions[:, None, :] - positions[None, :, :]
        distance_adj_matrix = np.sqrt((deltas ** 2).sum(axis=-1))

        bondtype_adj_matrix = np.zeros((atom_num, atom_num), dtype = int)
        for bond in mol.GetBonds():
            a1 = bond.GetBeginAtom().GetIdx()
            a2 = bond.GetEndAtom().GetIdx()
            code = bond2num[str(bond.GetBondType())]
            # Fill both directions; the original wrote [a1, a2] twice and
            # left [a2, a1] at zero.
            bondtype_adj_matrix[a1, a2] = code
            bondtype_adj_matrix[a2, a1] = code
        graph3d_lst.append((idx2atom, distance_adj_matrix, bondtype_adj_matrix))
    return graph3d_lst
def sdffile2graph3d_lst(sdffile):
    """Convert SDF file into a list of 3D graph.

    Args:
        sdffile: SDF file

    Returns:
        graph3d_lst: a list of 3D graph.
            each graph has (i) idx2atom (dict);
                           (ii) distance_adj_matrix (np.array);
                           (iii) bondtype_adj_matrix (np.array)
    """
    return mol_conformer2graph3d(sdffile2mol_conformer(sdffile))
def sdffile2selfies_lst(sdf):
    """Convert sdffile into a list of SELFIES strings.

    Args:
        sdf: str, file

    Returns:
        selfies_lst: a list of SELFIES strings.
    """
    return [smiles2selfies(smiles) for smiles in sdffile2smiles_lst(sdf)]
def smiles_lst2coulomb(smiles_lst):
    """Convert a list of SMILES strings into coulomb format.

    Args:
        smiles_lst: a list of SMILES strings.

    Returns:
        features: np.array

    Raises:
        ImportError: if chemml is not installed.
    """
    # Import locally so the function works when called directly; the
    # module-level names ``Molecule`` and ``CoulombMatrix`` only exist after
    # MolConvert(dst='Coulumb') was constructed.
    try:
        from chemml.chem import CoulombMatrix, Molecule
    except ImportError as err:
        raise ImportError("Please install chemml via 'pip install pybel' and 'pip install chemml'. ") from err

    molecules = [Molecule(smiles, 'smiles') for smiles in smiles_lst]
    for mol in molecules:
        mol.to_xyz(optimizer='UFF')
    cm = CoulombMatrix(cm_type='UM', n_jobs=-1)
    features = cm.represent(molecules)
    return features.to_numpy()
## features has shape (nmol, max_atom_n**2),
## where max_atom_n is the maximal number of atoms among the molecules in smiles_lst
## features[i].reshape(max_atom_n, max_atom_n)[:3,:3] -> 3*3 Coulomb matrix
def sdffile2coulomb(sdf):
    """Convert sdffile into coulomb features.

    Args:
        sdf: str, file

    Returns:
        coulomb feature: np.array
    """
    molecules_as_smiles = sdffile2smiles_lst(sdf)
    features = smiles_lst2coulomb(molecules_as_smiles)
    return features
def xyzfile2coulomb(xyzfile):
    """Convert an xyz file into coulomb features.

    Args:
        xyzfile: str, file

    Returns:
        The result of ``smiles_lst2coulomb`` on a one-element list holding
        the canonicalized SMILES of the file.
    """
    canonical = canonicalize(xyzfile2smiles(xyzfile))
    return smiles_lst2coulomb([canonical])
#2D_format = ['SMILES', 'SELFIES', 'Graph2D', 'PyG', 'DGL', 'ECFP2', 'ECFP4', 'ECFP6', 'MACCS', 'Daylight', 'RDKit2D', 'Morgan', 'PubChem'] #3D_format = ['Graph3D', 'Coulumb'] ## XXX2smiles
def molfile2smiles(molfile):
    """Convert molfile into SMILES string.

    Args:
        molfile: str, a file.

    Returns:
        smiles: str, the canonicalized SMILES string
    """
    molecule = Chem.MolFromMolFile(molfile)
    return canonicalize(Chem.MolToSmiles(molecule))
def mol2file2smiles(molfile):
    """Convert mol2file into SMILES string.

    Args:
        molfile: str, a mol2 file.

    Returns:
        smiles: str, the canonicalized SMILES string
    """
    molecule = Chem.MolFromMol2File(molfile)
    return canonicalize(Chem.MolToSmiles(molecule))
## smiles2xxx atom_types = ['C', 'N', 'O', 'H', 'F', 'unknown'] ### Cl, S?
def atom2onehot(atom):
    """Convert atom to one-hot feature vector.

    Args:
        atom: str, an atom symbol, e.g. 'C'. Symbols that are not listed in
            ``atom_types`` are mapped to its 'unknown' slot.

    Returns:
        np.array of shape (1, len(atom_types)), e.g. [[1, 0, 0, 0, 0, 0]]
    """
    onehot = np.zeros((1, len(atom_types)))
    # Fall back to the dedicated 'unknown' column instead of letting
    # list.index raise ValueError for unlisted symbols (e.g. 'Cl', 'S').
    key = atom if atom in atom_types else 'unknown'
    onehot[0, atom_types.index(key)] = 1
    return onehot
def atomstring2atomfeature(atom_string_list):
    """Stack the one-hot vectors of a list of atom symbols.

    Args:
        atom_string_list: list of str, atom symbols.

    Returns:
        np.array with one ``atom2onehot`` row per atom, concatenated along
        axis 0.
    """
    rows = list(map(atom2onehot, atom_string_list))
    return np.concatenate(rows, 0)
def raw3D2pyg(raw3d_feature):
    """Convert raw3d feature to pyg (torch-geometric) feature.

    Args:
        raw3d_feature: (atom_string_list, positions)
            - atom_string_list: list, each element is an atom
            - positions: np.array of atom coordinates

    Returns:
        data = Data(x=x, pos=pos), where ``x`` is the tensor of one-hot atom
        features and ``pos`` the tensor built from ``positions``.
    """
    import torch
    from torch_geometric.data import Data

    atom_string_list, positions = raw3d_feature
    x = torch.from_numpy(atomstring2atomfeature(atom_string_list))
    pos = torch.from_numpy(positions)
    return Data(x = x, pos = pos)
# Supported conversions: maps each source format to the destination formats
# MolConvert accepts for it. MolConvert and MolConvert.eligible_format read
# this table. Note that the key/value spelling 'Coulumb' is what the code
# compares against at runtime.
convert_dict = {
    'SMILES': ['SELFIES', 'Graph2D', 'PyG', 'DGL', 'ECFP2', 'ECFP4', 'ECFP6', 'MACCS', 'Daylight', 'RDKit2D', 'Morgan', 'PubChem'],
    'SELFIES': ['SMILES', 'Graph2D', 'PyG', 'DGL', 'ECFP2', 'ECFP4', 'ECFP6', 'MACCS', 'Daylight', 'RDKit2D', 'Morgan', 'PubChem'],
    'mol': ['SMILES', 'SELFIES', 'Graph2D', 'PyG', 'DGL', 'ECFP2', 'ECFP4', 'ECFP6', 'MACCS', 'Daylight', 'RDKit2D', 'Morgan', 'PubChem'],
    'mol2': ['SMILES', 'SELFIES', 'Graph2D', 'PyG', 'DGL', 'ECFP2', 'ECFP4', 'ECFP6', 'MACCS', 'Daylight', 'RDKit2D', 'Morgan', 'PubChem'],
    'SDF': ['SMILES', 'SELFIES', 'Graph3D', 'Coulumb'],
    'XYZ': ['SMILES', 'SELFIES', 'Graph3D', 'Coulumb'],
    'Raw3D': ['PyG3D'],
}

# Destination formats whose list outputs MolConvert.__call__ wraps in np.array.
fingerprints_list = ['ECFP2', 'ECFP4', 'ECFP6', 'MACCS', 'Daylight', 'RDKit2D', 'Morgan', 'PubChem']

# Source formats routed through the "src -> SMILES -> dst" path of MolConvert.
twoD_format = ['SMILES', 'SELFIES', 'mol', 'mol2', ]
# Source formats routed through the direct per-(src, dst) functions of MolConvert.
threeD_format = ['SDF', 'XYZ', 'PyG3D', 'Raw3D', 'distance', 'Coulumb', 'shape', ] ### shape:mesh
class MolConvert:
    r"""MolConvert: convert the molecule from src format to dst format.

    Example:
        convert = MolConvert(src = 'SMILES', dst = 'Graph2D')
        g = convert('Clc1ccccc1C2C(=C(/N/C(=C2/C(=O)OCC)COCCN)C)\C(=O)OC')
        # g: graph with edge, node features
        g = convert(['Clc1ccccc1C2C(=C(/N/C(=C2/C(=O)OCC)COCCN)C)\C(=O)OC',
                     'CCCOc1cc2ncnc(Nc3ccc4ncsc4c3)c2cc1S(=O)(=O)C(C)(C)C'])
        # g: a list of graphs with edge, node features

    if src is 2D, dst can be only 2D output
    if src is 3D, dst can be both 2D and 3D outputs

    src: 2D - [SMILES, SELFIES]
         3D - [SDF file, XYZ file]
    dst: 2D - [2D Graph (+ PyG, DGL format), Canonical SMILES, SELFIES, Fingerprints]
         3D - [3D graphs (adj matrix entry is (distance, bond type)), Coulumb Matrix]

    The exact set of supported (src, dst) pairs is given by ``convert_dict``;
    see ``MolConvert.eligible_format``.
    """

    def __init__(self, src = 'SMILES', dst = 'Graph2D', radius = 2, nBits = 1024):
        """Resolve the conversion function for ``src`` -> ``dst``.

        Args:
            src: str, source format (a key of ``convert_dict``).
            dst: str, destination format (listed in ``convert_dict[src]``).
            radius: int, Morgan radius; only used when dst is 'Morgan'.
            nBits: int, Morgan fingerprint length; only used when dst is
                'Morgan'.

        Raises:
            Exception: if an optional dependency required by the requested
                format is missing, or if the (src, dst) pair is unsupported.
        """
        # Optional dependencies are published as module-level names because
        # other functions of this module look them up there.
        global sf, CoulombMatrix, Molecule, torch, Data, dgl

        self._src = src
        self._dst = dst
        self._radius = radius
        self._nbits = nBits
        self.convert_dict = convert_dict

        if 'SELFIES' == src or 'SELFIES' == dst:
            try:
                import selfies as sf
            except Exception as err:
                raise Exception("Please install selfies via 'pip install selfies'") from err

        if 'Coulumb' == dst:
            try:
                from chemml.chem import CoulombMatrix, Molecule
            except Exception as err:
                raise Exception("Please install chemml via 'pip install pybel' and 'pip install chemml'. ") from err

        if 'PyG' == dst:
            try:
                import torch
                from torch_geometric.data import Data
            except Exception as err:
                raise Exception("Please install PyTorch Geometric via 'https://pytorch-geometric.readthedocs.io/en/latest/notes/installation.html'.") from err

        if 'DGL' == dst:
            try:
                import dgl
            except Exception as err:
                raise Exception("Please install DGL via 'pip install dgl'.") from err

        # Explicit checks instead of ``assert``: asserts are stripped under
        # ``python -O``.
        if src not in self.convert_dict:
            raise Exception("src format is not supported")
        if dst not in self.convert_dict[src]:
            raise Exception('It is not supported to convert src to dst.')

        if src in twoD_format:
            ### 1. src -> SMILES
            to_smiles = {
                "SMILES": canonicalize,
                "SELFIES": selfies2smiles,
                "mol": molfile2smiles,
                "mol2": mol2file2smiles,
            }
            ### 2. SMILES -> all
            from_smiles = {
                'SMILES': canonicalize,
                'SELFIES': smiles2selfies,
                "Graph2D": smiles2graph2D,
                "PyG": smiles2PyG,
                "DGL": smiles2DGL,
                "ECFP2": smiles2ECFP2,
                "ECFP4": smiles2ECFP4,
                "ECFP6": smiles2ECFP6,
                "MACCS": smiles2maccs,
                "Daylight": smiles2daylight,
                "RDKit2D": smiles2rdkit2d,
                # Bind radius/nBits here. __call__ used to compare self.func
                # with smiles2morgan, but self.func is always the composed
                # lambda below, so the user's settings were never forwarded.
                "Morgan": lambda s: smiles2morgan(s, radius = self._radius, nBits = self._nbits),
                'PubChem': smiles2pubchem,
            }
            f1 = to_smiles[src]
            f2 = from_smiles[dst]
            self.func = lambda x: f2(f1(x))
        elif src in threeD_format:
            direct = {
                ### load from xyz file, input is a filename (str), only contain one smiles
                ('XYZ', 'SMILES'): xyzfile2smiles,
                ('XYZ', 'SELFIES'): xyzfile2selfies,
                ('XYZ', 'Graph3D'): xyzfile2graph3d,
                ('XYZ', 'Coulumb'): xyzfile2coulomb,
                ### SDF file
                ('SDF', 'Graph3D'): sdffile2graph3d_lst,
                ('SDF', 'SMILES'): sdffile2smiles_lst,
                ('SDF', 'SELFIES'): sdffile2selfies_lst,
                ('SDF', 'Coulumb'): sdffile2coulomb,
                ('Raw3D', 'PyG3D'): raw3D2pyg,
            }
            self.func = direct[(src, dst)]

    def __call__(self, x):
        """Convert one input or a list of inputs.

        Args:
            x: str (a single molecule / file name), list of such inputs,
                np.ndarray (converted with ``tolist()`` first), or -- for
                src 'Raw3D' -- a tuple (atom_string_list, positions).

        Returns:
            The converted object for a single input; for a list, the list of
            converted objects (wrapped in ``np.array`` when dst is a
            fingerprint); None for any other input type.
        """
        if isinstance(x, np.ndarray):
            x = x.tolist()
        if isinstance(x, str):
            return self.func(x)
        # raw3D2pyg unpacks a 2-tuple, which previously matched neither the
        # str nor the list branch and silently produced None.
        if self._src == 'Raw3D' and isinstance(x, tuple):
            return self.func(x)
        if isinstance(x, list):
            out = [self.func(item) for item in x]
            if self._dst in fingerprints_list:
                out = np.array(out)
            return out
        return None

    @staticmethod
    def eligible_format(src = None):
        '''Given a src format, output all the available dst formats.

        Example:
            MolConvert.eligible_format('SMILES')
            ## ['SELFIES', 'Graph2D', ...]

        Args:
            src: str or None.

        Returns:
            The list of destination formats for ``src``, or the whole
            ``convert_dict`` when ``src`` is None.

        Raises:
            Exception: if ``src`` is not a supported source format.
        '''
        if src is None:
            return convert_dict
        if src not in convert_dict:
            raise Exception("src format is not supported")
        return convert_dict[src]