# Source code for tdc.metadata

# -*- coding: utf-8 -*-
# Author: TDC Team
# License: MIT


"""This file contains all metadata of datasets in TDC.

Attributes:
    adme_dataset_names (list): all adme dataset names
    admet_benchmark (dict): a dictionary mapping each TDC task to its list of benchmark dataset names
    admet_metrics (dict): a dictionary mapping each dataset name to its recommended metric
    admet_splits (dict): a dictionary mapping each dataset name to its recommended split
    antibodyaff_dataset_names (list): all antibody_aff dataset names
    benchmark2id (dict): benchmark names to dataverse download ID
    benchmark2type (dict): benchmark names to file type in download format
    benchmark_names (dict): a dictionary mapping benchmark group name to each benchmark group dataset names
    bm_metric_names (dict): a dictionary mapping benchmark group name to each benchmark group metric names
    bm_split_names (dict): a dictionary mapping benchmark group name to each benchmark group split names
    catalyst_dataset_names (list): all catalyst dataset names
    category_names (dict): mapping from ML problem (1st tier) to all tasks
    crisproutcome_dataset_names (list): all crispr outcome dataset names
    dataset_list (list): total list of dataset names in TDC
    dataset_names (dict): mapping from task name to list of dataset names
    ddi_dataset_names (list): all ddi dataset names
    develop_dataset_names (list): all develop dataset names
    distribution_oracles (list): all distribution learning oracles, i.e. molecule evaluators
    docking_benchmark (dict): docking benchmark target names
    docking_target_info (dict): docking benchmark target pockets info
    download_oracle_names (list): oracle names that require downloading predictors
    drugres_dataset_names (list): all drugres dataset names
    drugsyn_benchmark (dict): drugcombo benchmark group targets
    drugsyn_dataset_names (list): all drugsyn dataset names
    drugsyn_metrics (dict): a dictionary mapping each dataset name to its recommended metric
    drugsyn_splits (dict): a dictionary mapping each dataset name to its recommended split
    dti_dataset_names (list): all dti dataset names
    dti_dg_benchmark (dict): dti_dg benchmark group dataset names
    dti_dg_metrics (dict): a dictionary mapping each dataset name to its recommended metric
    dti_dg_splits (dict): a dictionary mapping each dataset name to its recommended split
    epitope_dataset_names (list): all epitope dataset names
    evaluator_name (list): list of evaluator names
    forwardsyn_dataset_names (list): all reaction dataset names
    gda_dataset_names (list): all gda dataset names
    generation_datasets (list): all generation dataset names
    guacamol_oracle (list): list of oracles from guacamol
    hts_dataset_names (list): all hts dataset names
    meta_oracle_name (list): list of all meta oracle names
    molgenpaired_dataset_names (list): all molgenpaired dataset names
    mti_dataset_names (list): all mti dataset names
    name2id (dict): mapping from dataset names to dataverse id
    name2stats (dict): mapping from dataset names to statistics
    name2type (dict): mapping from dataset names to downloaded file format
    oracle2id (dict): mapping from oracle names to dataverse id
    oracle2type (dict): mapping from oracle names to downloaded file format
    receptor2id (dict): mapping from receptor id to a list of dataverse ids
    oracle_names (list): list of all oracle names
    paired_dataset_names (list): all paired dataset names
    paratope_dataset_names (list): all paratope dataset names
    peptidemhc_dataset_names (list): all peptidemhc dataset names
    ppi_dataset_names (list): all ppi dataset names
    property_names (list): a list of oracles that correspond to some molecular properties
    qm_dataset_names (list): all qm dataset names
    retrosyn_dataset_names (list): all retrosyn dataset names
    sdf_file_names (dict): mapping from a key (e.g. 'grambow') to its list of sdf file names
    single_molecule_dataset_names (list): all molgen dataset names
    multiple_molecule_dataset_names (list): all ligandmolgen dataset names
    synthetic_oracle_name (list): all oracle names for synthesis
    tcr_epi_dataset_names (list): all tcr_epi dataset names
    test_multi_pred_dataset_names (list): test multi pred task name
    test_single_pred_dataset_names (list): test single pred task name
    toxicity_dataset_names (list): all toxicity dataset names
    trivial_oracle_names (list): a list of oracle names for trivial oracles
    yield_dataset_names (list): all yield dataset names
"""
####################################
# Dataset names reserved for the test cases (one per prediction family).
test_single_pred_dataset_names = ["test_single_pred"]
test_multi_pred_dataset_names = ["test_multi_pred"]

# single_pred prediction: one list of dataset names per task.

toxicity_dataset_names = [
    "tox21",
    "toxcast",
    "clintox",
    "herg",
    "herg_central",
    "dili",
    "skin_reaction",
    "ames",
    "carcinogens_lagunin",
    "ld50_zhu",
]

adme_dataset_names = [
    "lipophilicity_astrazeneca",
    "solubility_aqsoldb",
    "hydrationfreeenergy_freesolv",
    "caco2_wang",
    "hia_hou",
    "pgp_broccatelli",
    "bioavailability_ma",
    "vdss_lombardo",
    "cyp2c19_veith",
    "cyp2d6_veith",
    "cyp3a4_veith",
    "cyp1a2_veith",
    "cyp2c9_veith",
    "cyp2c9_substrate_carbonmangels",
    "cyp2d6_substrate_carbonmangels",
    "cyp3a4_substrate_carbonmangels",
    "bbb_martins",
    "ppbr_az",
    "half_life_obach",
    "clearance_hepatocyte_az",
    "clearance_microsome_az",
]

hts_dataset_names = [
    "hiv",
    "sarscov2_3clpro_diamond",
    "sarscov2_vitro_touret",
]

qm_dataset_names = ["qm7", "qm7b", "qm8", "qm9"]

epitope_dataset_names = ["iedb_jespersen", "pdb_jespersen"]

paratope_dataset_names = ["sabdab_liberis"]

develop_dataset_names = ["tap", "sabdab_chen"]

####################################
# multi_pred prediction: one list of dataset names per task.
# NOTE(review): category_names files "Yields" and "CRISPROutcome" under
# 'single_pred' although their lists are declared in this section — confirm
# which grouping is intended.

dti_dataset_names = [
    "davis",
    "kiba",
    "bindingdb_kd",
    "bindingdb_ic50",
    "bindingdb_ki",
    "bindingdb_patent",
]

ppi_dataset_names = ["huri"]

peptidemhc_dataset_names = ["mhc2_iedb_jensen", "mhc1_iedb-imgt_nielsen"]

ddi_dataset_names = ["drugbank", "twosides"]

mti_dataset_names = ["mirtarbase"]

gda_dataset_names = ["disgenet"]

crisproutcome_dataset_names = ["leenay"]

drugres_dataset_names = ["gdsc1", "gdsc2"]

drugsyn_dataset_names = ["oncopolypharmacology", "drugcomb"]

antibodyaff_dataset_names = ["protein_sabdab"]

yield_dataset_names = ["uspto_yields", "buchwald-hartwig"]

catalyst_dataset_names = ["uspto_catalyst"]

tcr_epi_dataset_names = ["weber"]

####################################
# generation: one list of dataset names per task.

retrosyn_dataset_names = ["uspto50k", "uspto"]

forwardsyn_dataset_names = ["uspto"]

single_molecule_dataset_names = [
    "zinc",
    "moses",
    "chembl",
    "chembl_v29",
]

multiple_molecule_dataset_names = ["scpdb"]

# Same contents as retrosyn_dataset_names, kept as an independent list object.
paired_dataset_names = ["uspto50k", "uspto"]


####################################
# resource: names of the compound libraries and knowledge graphs.

compound_library_names = [
    "drugbank_drugs",
    "chembl_drugs",
    "broad_repurposing_hub",
    "antivirals",
]
biokg_library_names = ["hetionet"]

####################################
# oracles

# Evaluators for distribution learning; the input of __call__ is a list of smiles.
distribution_oracles = [
    "novelty",
    "diversity",
    "uniqueness",
    "validity",
    "fcd_distance",
    "kl_divergence",
]


property_names = ["drd2", "qed", "logp", "sa", "gsk3b", "jnk3"]

# Prediction metrics first, then every distribution-learning evaluator appended.
evaluator_name = [
    "roc-auc",
    "f1",
    "pr-auc",
    "precision",
    "recall",
    "accuracy",
    "mse",
    "rmse",
    "mae",
    "r2",
    "micro-f1",
    "macro-f1",
    "kappa",
    "avg-roc-auc",
    "rp@k",
    "pr@k",
    "pcc",
    "spearman",
    "range_logAUC",
] + distribution_oracles

guacamol_oracle = [
    # generic oracle families
    "rediscovery",
    "similarity",
    "median",
    "isomers",
    "mpo",
    "hop",
    # rediscovery
    "celecoxib_rediscovery",
    "troglitazone_rediscovery",
    "thiothixene_rediscovery",
    # similarity
    "aripiprazole_similarity",
    "albuterol_similarity",
    "mestranol_similarity",
    # isomers
    "isomers_c7h8n2o2",
    "isomers_c9h10n2o2pf2cl",
    "isomers_c11h24",
    # mpo
    "osimertinib_mpo",
    "fexofenadine_mpo",
    "ranolazine_mpo",
    "perindopril_mpo",
    "amlodipine_mpo",
    "sitagliptin_mpo",
    "zaleplon_mpo",
    # median
    "median1",
    "median2",
    # smarts / hop
    "valsartan_smarts",
    "deco_hop",
    "scaffold_hop",
]


####################################
# Benchmark Datasets

# ADMET benchmark group: task name -> dataset names in the group.
admet_benchmark = {
    "ADME": [
        "caco2_wang",
        "hia_hou",
        "pgp_broccatelli",
        "bioavailability_ma",
        "lipophilicity_astrazeneca",
        "solubility_aqsoldb",
        "bbb_martins",
        "ppbr_az",
        "vdss_lombardo",
        "cyp2d6_veith",
        "cyp3a4_veith",
        "cyp2c9_veith",
        "cyp2d6_substrate_carbonmangels",
        "cyp3a4_substrate_carbonmangels",
        "cyp2c9_substrate_carbonmangels",
        "half_life_obach",
        "clearance_microsome_az",
        "clearance_hepatocyte_az",
    ],
    "Tox": [
        "herg",
        "ames",
        "dili",
        "ld50_zhu",
    ],
}

drugsyn_benchmark = {
    "Synergy": [
        "drugcomb_css",
        "drugcomb_hsa",
        "drugcomb_loewe",
        "drugcomb_bliss",
        "drugcomb_zip",
    ],
}

dti_dg_benchmark = {"DTI": ["bindingdb_patent"]}

docking_benchmark = {
    "Targets": [
        "1iep",
        "2rgp",
        "3eml",
        "3ny8",
        "4rlu",
        "4unn",
        "5mo4",
        "7l11",
        "3pbl",
    ],
}

# Per-target docking pocket: a 'center' triple and a 'size' triple.
docking_target_info = {
    "3pbl": {
        "center": (9, 22.5, 26),
        "size": (15, 15, 15),
    },
    "1iep": {
        "center": (15.61389189189189, 53.38013513513513, 15.454837837837842),
        "size": (15, 15, 15),
    },
    "2rgp": {
        "center": (16.292121212121213, 34.87081818181819, 92.0353030303030),
        "size": (15, 15, 15),
    },
    "3eml": {
        "center": (-9.063639999999998, -7.1446, 55.86259999999999),
        "size": (15, 15, 15),
    },
    "3ny8": {
        "center": (2.2488, 4.68495, 51.39820000000001),
        "size": (15, 15, 15),
    },
    "4rlu": {
        "center": (-0.7359999999999999, 22.75547368421052, -31.2368947368421),
        "size": (15, 15, 15),
    },
    "4unn": {
        "center": (5.684346153846153, 18.191769230769232, -7.37157692307692),
        "size": (15, 15, 15),
    },
    "5mo4": {
        "center": (-44.901709677419355, 20.490354838709674, 8.483354838709678),
        "size": (15, 15, 15),
    },
    "7l11": {
        "center": (-21.814812500000006, -4.216062499999999, -27.983781250000),
        "size": (15, 15, 15),
    },
}

####################################

#### Benchmark Metrics
# Each dictionary maps a benchmark dataset name to its recommended metric.
admet_metrics = {
    "caco2_wang": "mae",
    "hia_hou": "roc-auc",
    "pgp_broccatelli": "roc-auc",
    "bioavailability_ma": "roc-auc",
    "lipophilicity_astrazeneca": "mae",
    "solubility_aqsoldb": "mae",
    "bbb_martins": "roc-auc",
    "ppbr_az": "mae",
    "vdss_lombardo": "spearman",
    "cyp2c9_veith": "pr-auc",
    "cyp2d6_veith": "pr-auc",
    "cyp3a4_veith": "pr-auc",
    "cyp2c9_substrate_carbonmangels": "pr-auc",
    "cyp3a4_substrate_carbonmangels": "roc-auc",
    "cyp2d6_substrate_carbonmangels": "pr-auc",
    "half_life_obach": "spearman",
    "clearance_hepatocyte_az": "spearman",
    "clearance_microsome_az": "spearman",
    "ld50_zhu": "mae",
    "herg": "roc-auc",
    "ames": "roc-auc",
    "dili": "roc-auc",
}

drugsyn_metrics = {
    "drugcomb_css": "mae",
    "drugcomb_hsa": "mae",
    "drugcomb_loewe": "mae",
    "drugcomb_bliss": "mae",
    "drugcomb_zip": "mae",
    "drugcomb_css_brain": "mae",
    "drugcomb_css_ovary": "mae",
    "drugcomb_css_lung": "mae",
    "drugcomb_css_skin": "mae",
    "drugcomb_css_hematopoietic_lymphoid": "mae",
    "drugcomb_css_breast": "mae",
    "drugcomb_css_prostate": "mae",
    "drugcomb_css_kidney": "mae",
    "drugcomb_css_colon": "mae",
}

dti_dg_metrics = {"bindingdb_patent": "pcc"}

#### Benchmark Splits
# Each dictionary maps a benchmark dataset name to its recommended split.
admet_splits = {
    "caco2_wang": "scaffold",
    "hia_hou": "scaffold",
    "pgp_broccatelli": "scaffold",
    "bioavailability_ma": "scaffold",
    "lipophilicity_astrazeneca": "scaffold",
    "solubility_aqsoldb": "scaffold",
    "bbb_martins": "scaffold",
    "ppbr_az": "scaffold",
    "vdss_lombardo": "scaffold",
    "cyp2c9_veith": "scaffold",
    "cyp2d6_veith": "scaffold",
    "cyp3a4_veith": "scaffold",
    "cyp2c9_substrate_carbonmangels": "scaffold",
    "cyp3a4_substrate_carbonmangels": "scaffold",
    "cyp2d6_substrate_carbonmangels": "scaffold",
    "half_life_obach": "scaffold",
    "clearance_hepatocyte_az": "scaffold",
    "clearance_microsome_az": "scaffold",
    "ld50_zhu": "scaffold",
    "herg": "scaffold",
    "ames": "scaffold",
    "dili": "scaffold",
}

drugsyn_splits = {
    "drugcomb_css": "combination",
    "drugcomb_hsa": "combination",
    "drugcomb_loewe": "combination",
    "drugcomb_bliss": "combination",
    "drugcomb_zip": "combination",
}

dti_dg_splits = {"bindingdb_patent": "group"}

####################################

# Evaluators for a single molecule; the input of __call__ is a single smiles
# OR a list of smiles.
download_oracle_names = [
    "drd2",
    "gsk3b",
    "jnk3",
    "fpscores",
    "cyp3a4_veith",
]
# Three property oracles followed by every guacamol oracle.
trivial_oracle_names = ["qed", "logp", "sa"] + guacamol_oracle
synthetic_oracle_name = ["askcos", "ibm_rxn"]
# Docking oracles: the same ten targets in three variants
# (plain, "_normalize", "_vina"), listed variant by variant.
download_receptor_oracle_name = [
    # <target>_docking
    "1iep_docking",
    "2rgp_docking",
    "3eml_docking",
    "3ny8_docking",
    "4rlu_docking",
    "4unn_docking",
    "5mo4_docking",
    "7l11_docking",
    "drd3_docking",
    "3pbl_docking",
    # <target>_docking_normalize
    "1iep_docking_normalize",
    "2rgp_docking_normalize",
    "3eml_docking_normalize",
    "3ny8_docking_normalize",
    "4rlu_docking_normalize",
    "4unn_docking_normalize",
    "5mo4_docking_normalize",
    "7l11_docking_normalize",
    "drd3_docking_normalize",
    "3pbl_docking_normalize",
    # <target>_docking_vina
    "1iep_docking_vina",
    "2rgp_docking_vina",
    "3eml_docking_vina",
    "3ny8_docking_vina",
    "4rlu_docking_vina",
    "4unn_docking_vina",
    "5mo4_docking_vina",
    "7l11_docking_vina",
    "drd3_docking_vina",
    "3pbl_docking_vina",
]


meta_oracle_name = [
    "isomer_meta",
    "rediscovery_meta",
    "similarity_meta",
    "median_meta",
    "docking_score",
    "molecule_one_synthesis",
    "pyscreener",
]

# Every oracle name, concatenated group by group (order matters to callers
# that index into the list, so the group order is kept as-is).
oracle_names = (
    download_oracle_names
    + trivial_oracle_names
    + distribution_oracles
    + synthetic_oracle_name
    + meta_oracle_name
    + download_receptor_oracle_name
)

molgenpaired_dataset_names = ["qed", "drd2", "logp"]

# NOTE(review): single_molecule_dataset_names is not part of this sum, and
# "uspto" appears in both retrosyn and forwardsyn lists, so it is repeated
# here — confirm both are intended.
generation_datasets = (
    retrosyn_dataset_names
    + forwardsyn_dataset_names
    + molgenpaired_dataset_names
    + multiple_molecule_dataset_names
)
# generation
####################################

# ML problem (1st tier) -> names of the tasks that belong to it.
category_names = {
    "single_pred": [
        "Tox",
        "ADME",
        "HTS",
        "Epitope",
        "Develop",
        "QM",
        "Paratope",
        "Yields",
        "CRISPROutcome",
    ],
    "multi_pred": [
        "DTI",
        "PPI",
        "DDI",
        "PeptideMHC",
        "DrugRes",
        "AntibodyAff",
        "DrugSyn",
        "MTI",
        "GDA",
        "Catalyst",
        "TCR_Epitope_Binding",
    ],
    "generation": [
        "RetroSyn",
        "Reaction",
        "MolGen",
    ],
}

def get_task2category():
    """Invert ``category_names`` into a task -> category lookup.

    Returns:
        dict: a dictionary with key the task name (e.g. ``"Tox"``) and value
        the ML problem it is listed under in ``category_names``
        (e.g. ``"single_pred"``). A fresh dictionary is built on every call.
    """
    # Should a task ever be listed under several categories, the category that
    # comes last in category_names wins, exactly as with sequential assignment.
    return {
        task: category
        for category, tasks in category_names.items()
        for task in tasks
    }
# Task name -> list of dataset names available for that task.
# NOTE(review): the key "TCREpitopeBinding" is spelled "TCR_Epitope_Binding" in
# category_names, and "LigandMolGen" has no entry there — confirm whether the
# two tables are meant to share keys.
dataset_names = {
    "Tox": toxicity_dataset_names,
    "ADME": adme_dataset_names,
    "HTS": hts_dataset_names,
    "DTI": dti_dataset_names,
    "PPI": ppi_dataset_names,
    "DDI": ddi_dataset_names,
    "RetroSyn": retrosyn_dataset_names,
    "Reaction": forwardsyn_dataset_names,
    "MolGen": single_molecule_dataset_names,
    "LigandMolGen": multiple_molecule_dataset_names,
    "PeptideMHC": peptidemhc_dataset_names,
    "Epitope": epitope_dataset_names,
    "Develop": develop_dataset_names,
    "DrugRes": drugres_dataset_names,
    "QM": qm_dataset_names,
    "AntibodyAff": antibodyaff_dataset_names,
    "DrugSyn": drugsyn_dataset_names,
    "MTI": mti_dataset_names,
    "GDA": gda_dataset_names,
    "Paratope": paratope_dataset_names,
    "Yields": yield_dataset_names,
    "Catalyst": catalyst_dataset_names,
    "CRISPROutcome": crisproutcome_dataset_names,
    "test_single_pred": test_single_pred_dataset_names,
    "test_multi_pred": test_multi_pred_dataset_names,
    "TCREpitopeBinding": tcr_epi_dataset_names,
}

# Benchmark group name -> that group's dataset names.
benchmark_names = {
    "admet_group": admet_benchmark,
    "drugcombo_group": drugsyn_benchmark,
    "docking_group": docking_benchmark,
    "dti_dg_group": dti_dg_benchmark,
}

# Benchmark group name -> that group's recommended metrics.
bm_metric_names = {
    "admet_group": admet_metrics,
    "drugcombo_group": drugsyn_metrics,
    "dti_dg_group": dti_dg_metrics,
}

# Benchmark group name -> that group's recommended splits.
bm_split_names = {
    "admet_group": admet_splits,
    "drugcombo_group": drugsyn_splits,
    "dti_dg_group": dti_dg_splits,
}

# Flat list of every dataset name across all tasks, lower-cased, in the order
# of dataset_names. Names listed under several tasks (e.g. "uspto") are
# repeated, not de-duplicated.
dataset_list = [
    name.lower()
    for names in dataset_names.values()
    for name in names
]

# Dataset name -> downloaded file format.
name2type = {
    "toxcast": "tab",
    "tox21": "tab",
    "clintox": "tab",
    "lipophilicity_astrazeneca": "tab",
    "solubility_aqsoldb": "tab",
    "hydrationfreeenergy_freesolv": "tab",
    "caco2_wang": "tab",
    "hia_hou": "tab",
    "pgp_broccatelli": "tab",
    "f20_edrug3d": "tab",
    "f30_edrug3d": "tab",
    "bioavailability_ma": "tab",
    "vd_edrug3d": "tab",
    "cyp2c19_veith": "tab",
    "cyp2d6_veith": "tab",
    "cyp3a4_veith": "tab",
    "cyp1a2_veith": "tab",
    "cyp2c9_veith": "tab",
    "cyp2c9_substrate_carbonmangels": "tab",
    "cyp2d6_substrate_carbonmangels": "tab",
    "cyp3a4_substrate_carbonmangels": "tab",
    "carcinogens_lagunin": "tab",
    "halflife_edrug3d": "tab",
    "clearance_edrug3d": "tab",
    "bbb_adenot": "tab",
    "bbb_martins": "tab",
    "ppbr_ma": "tab",
    "ppbr_edrug3d": "tab",
    "hiv": "tab",
    "sarscov2_3clpro_diamond": "tab",
    "sarscov2_vitro_touret": "tab",
    "davis": "tab",
    "kiba": "tab",
    "bindingdb_kd": "tab",
    "bindingdb_ic50": "csv",
    "bindingdb_ki": "csv",
    "bindingdb_patent": "csv",
    "huri": "tab",
    "drugbank": "tab",
    "twosides": "csv",
    "mhc1_iedb-imgt_nielsen": "tab",
    "mhc2_iedb_jensen": "tab",
    "uspto": "csv",
    "uspto50k": "tab",
    "zinc": "tab",
    "moses": "tab",
    "chembl": "tab",
    "chembl_v29": "csv",
    "qed": "tab",
    "drd2": "tab",
    "logp": "tab",
    "drugcomb": "pkl",
    "gdsc1": "pkl",
    "gdsc2": "pkl",
    "iedb_jespersen": "pkl",
    "pdb_jespersen": "pkl",
    "qm7": "pkl",
    "qm7b": "pkl",
    "qm8": "pkl",
    "qm9": "pkl",
    "scpdb": "pkl",
    "tap": "tab",
    "sabdab_chen": "tab",
    "protein_sabdab": "csv",
    "oncopolypharmacology": "pkl",
    "mirtarbase": "csv",
    "disgenet": "csv",
    "sabdab_liberis": "pkl",
    "uspto_yields": "pkl",
    "uspto_catalyst": "csv",
    "buchwald-hartwig": "pkl",
    "hetionet": "tab",
    "herg": "tab",
    "herg_central": "tab",
    "dili": "tab",
    "ppbr_az": "tab",
    "ames": "tab",
    "skin_reaction": "tab",
    "drugbank_drugs": "csv",
    "clearance_microsome_az": "tab",
    "clearance_hepatocyte_az": "tab",
    "half_life_obach": "tab",
    "ld50_zhu": "tab",
    "vdss_lombardo": "tab",
    "leenay": "tab",
    "test_single_pred": "tab",
    "test_multi_pred": "tab",
    "gdsc_gene_symbols": "tab",
    "weber": "tab",
    "primekg": "tab",
    "primekg_drug_feature": "tab",
    "primekg_disease_feature": "tab",
}

# Dataset name -> dataverse id.
name2id = {
    "bbb_adenot": 4259565,
    "bbb_martins": 4259566,
    "bindingdb_ic50": 4291560,
    "bindingdb_kd": 4291555,
    "bindingdb_ki": 4291556,
    "bindingdb_patent": 4724851,
    "bioavailability_ma": 4259567,
    "caco2_wang": 4259569,
    "clearance_edrug3d": 4259571,
    "clintox": 4259572,
    "cyp1a2_veith": 4259573,
    "cyp2c19_veith": 4259576,
    "cyp2c9_veith": 4259577,
    "cyp2d6_veith": 4259580,
    "cyp3a4_veith": 4259582,
    "cyp2c9_substrate_carbonmangels": 4259584,
    "cyp2d6_substrate_carbonmangels": 4259578,
    "cyp3a4_substrate_carbonmangels": 4259581,
    "carcinogens_lagunin": 4259570,
    "davis": 5219748,
    "drugbank": 4139573,
    "drugcomb": 4215720,
    "f20_edrug3d": 4259586,
    "f30_edrug3d": 4259589,
    "halflife_edrug3d": 4259587,
    "hia_hou": 4259591,
    "hiv": 4259593,
    "huri": 4139567,
    "hydrationfreeenergy_freesolv": 4259594,
    "kiba": 5255037,
    "lipophilicity_astrazeneca": 4259595,
    "pgp_broccatelli": 4259597,
    "ppbr_edrug3d": 4259600,
    "ppbr_ma": 4259603,
    "sarscov2_3clpro_diamond": 4259606,
    "sarscov2_vitro_touret": 4259607,
    "solubility_aqsoldb": 4259610,
    "tox21": 4259612,
    "toxcast": 4259613,
    "twosides": 4139574,
    "vd_edrug3d": 4259618,
    "mhc1_iedb-imgt_nielsen": 4167073,
    "mhc2_iedb_jensen": 4167074,
    "zinc": 4170963,
    "moses": 4170962,
    "chembl": 4170965,
    "chembl_v29": 5767979,
    "qed": 4170959,
    "drd2": 4170957,
    "logp": 4170961,
    "gdsc1": 4165726,
    "gdsc2": 4165727,
    "iedb_jespersen": 4165725,
    "pdb_jespersen": 4165724,
    "qm7": 6358510,
    "qm7b": 6358512,
    "qm8": 6358513,
    # Original trailing comment on this entry: "### 4167112, 6175612"
    # NOTE(review): presumably earlier ids for qm9 — confirm before relying on them.
    "qm9": 6179310,
    "scpdb": 6190916,
    "tap": 4167113,
    "sabdab_chen": 4167164,
    "protein_sabdab": 4167357,
    "oncopolypharmacology": 4167358,
    "mirtarbase": 4167359,
    "disgenet": 4168282,
    "sabdab_liberis": 4168425,
    "uspto50k": 4171823,
    "buchwald-hartwig": 6175640,
    "uspto_yields": 4186956,
    "uspto_catalyst": 4171574,
    "uspto": 4171642,
    "hetionet": 4201734,
    "herg": 4259588,
    "herg_central": 5740618,
    "dili": 4259585,
    "ppbr_az": 6413140,
    "ames": 4259564,
    "skin_reaction": 4259609,
    "clearance_microsome_az": 4266186,
    "clearance_hepatocyte_az": 4266187,
    "ld50_zhu": 4267146,
    "half_life_obach": 4266799,
    "vdss_lombardo": 4267387,
    "leenay": 4279966,
    "test_single_pred": 4832455,
    "test_multi_pred": 4832456,
    "gdsc_gene_symbols": 5255026,
    "weber": 5790963,
    "primekg": 6180626,
    "primekg_drug_feature": 6180619,
    "primekg_disease_feature": 6180618,
}

# Oracle name -> downloaded file format.
oracle2type = {
    "drd2": "pkl",
    "jnk3": "pkl",
    "gsk3b": "pkl",
    "fpscores": "pkl",
    "cyp3a4_veith": "pkl",
}

# Oracle name -> dataverse id.
oracle2id = {
    "drd2": 4178625,
    "gsk3b": 4170295,
    "jnk3": 4170293,
    "fpscores": 4170416,
    "cyp3a4_veith": 4411249,
}

# Benchmark group name -> file type in download format.
benchmark2type = {
    "admet_group": "zip",
    "drugcombo_group": "zip",
    "docking_group": "zip",
    "dti_dg_group": "zip",
}

# Benchmark group name -> dataverse download id.
benchmark2id = {
    "admet_group": 4426004,
    "drugcombo_group": 4426002,
    "docking_group": 4554082,
    "dti_dg_group": 4742443,
}

# Receptor id -> list of two dataverse ids.
# NOTE(review): what each of the two ids refers to is not visible here — confirm.
receptor2id = {
    "1iep": [5137914, 5617659],
    "2rgp": [5137916, 5617662],
    "3eml": [5137919, 5617663],
    "3ny8": [5137915, 5617665],
    "4rlu": [5137918, 5617658],
    "4unn": [5137917, 5617661],
    "5mo4": [5137920, 5617664],
    "7l11": [5137921, 5617660],
    "3pbl": [5257195, 5617666],
}  ## 'drd3': 5137901,

sdf_file_names = {
    "grambow": ["Product", "Reactant", "TS"],
}

# Dataset name -> statistic recorded for the dataset.
# NOTE(review): presumably the number of data points — confirm.
name2stats = {
    "caco2_wang": 906,
    "hia_hou": 578,
    "pgp_broccatelli": 1212,
    "bioavailability_ma": 640,
    "lipophilicity_astrazeneca": 4200,
    "solubility_aqsoldb": 9982,
    "bbb_martins": 1975,
    "ppbr_az": 1797,
    "vdss_lombardo": 1130,
    # NOTE(review): same value as cyp2c9_veith below — confirm it is not a copy-paste.
    "cyp2c19_veith": 12092,
    "cyp2d6_veith": 13130,
    "cyp3a4_veith": 12328,
    "cyp1a2_veith": 12579,
    "cyp2c9_veith": 12092,
    "cyp2c9_substrate_carbonmangels": 666,
    "cyp2d6_substrate_carbonmangels": 664,
    "cyp3a4_substrate_carbonmangels": 667,
    "half_life_obach": 667,
    "clearance_hepatocyte_az": 1020,
    "clearance_microsome_az": 1102,
    "ld50_zhu": 7385,
    "herg": 648,
    "ames": 7255,
    "dili": 475,
    "skin_reaction": 404,
    "carcinogens_lagunin": 278,
    "tox21": 7831,
    "clintox": 1484,
    "sarscov2_vitro_touret": 1480,
    "sarscov2_3clpro_diamond": 879,
    "hiv": 41127,
    "qm7": 7165,
    "qm7b": 7211,
    "qm8": 21747,
    "qm9": 133885,
    "uspto_yields": 853638,
    "buchwald-hartwig": 55370,
    "sabdab_liberis": 1023,
    "iedb_jespersen": 3159,
    "pdb_jespersen": 447,
    "tap": 242,
    "sabdab_chen": 2409,
    "leenay": 1521,
    "bindingdb_kd": 52284,
    "bindingdb_ki": 375032,
    "bindingdb_ic50": 991486,
    "bindingdb_patent": 243344,
    "davis": 27621,
    "kiba": 118036,
    "drugbank": 191808,
    "twosides": 4649441,
    "huri": 51813,
    "disgenet": 52476,
    "gdsc1": 177310,
    "gdsc2": 92703,
    "drugcomb": 297098,
    "oncopolypharmacology": 23052,
    "mhc1_iedb-imgt_nielsen": 185985,
    "mhc2_iedb_jensen": 134281,
    "protein_sabdab": 493,
    "mirtarbase": 400082,
    "uspto_catalyst": 721799,
    "moses": 1936962,
    "zinc": 249455,
    "chembl": 1961462,
    "uspto50k": 50036,
    "uspto": 1939253,
}