# -*- coding: utf-8 -*-
# Author: TDC Team
# License: MIT
from packaging import version
import pkg_resources
"""This file contains all metadata of datasets in TDC.
Attributes:
adme_dataset_names (list): all adme dataset names
admet_benchmark (dict): a dictionary with key the TDC task and value a list of dataset names
admet_metrics (dict): a dictionary with key the dataset name and value the recommended metric
admet_splits (dict): a dictionary with key the dataset name and value the recommended split
antibodyaff_dataset_names (list): all antibody_aff dataset names
benchmark2id (dict): benchmark names to dataverse download ID
benchmark2type (dict): benchmark names to file type in download format
benchmark_names (dict): a dictionary mapping benchmark group name to each benchmark group dataset names
bm_metric_names (dict): a dictionary mapping benchmark group name to each benchmark group metric names
bm_split_names (dict): a dictionary mapping benchmark group name to each benchmark group split names
catalyst_dataset_names (list): all catalyst dataset names
category_names (dict): mapping from ML problem (1st tier) to all tasks
crisproutcome_dataset_names (list): all crispr outcome dataset names
dataset_list (list): total list of dataset names in TDC
dataset_names (dict): mapping from task name to list of dataset names
ddi_dataset_names (list): all ddi dataset names
develop_dataset_names (list): all develop dataset names
distribution_oracles (list): all distribution learning oracles, i.e. molecule evaluators
docking_oracles (list): all docking oracles, i.e. RMSD
docking_benchmark (dict): docking benchmark target names
docking_target_info (dict): docking benchmark target pockets info
download_oracle_names (list): oracle names that require downloading predictors
drugres_dataset_names (list): all drugres dataset names
drugsyn_benchmark (dict): drugcombo benchmark group targets
drugsyn_dataset_names (list): all drugsyn dataset names
drugsyn_metrics (dict): a dictionary with key the dataset name and value the recommended metric
drugsyn_splits (dict): a dictionary with key the dataset name and value the recommended split
dti_dataset_names (list): all dti dataset names
dti_dg_benchmark (dict): dti_dg benchmark group dataset names
dti_dg_metrics (dict): a dictionary with key the dataset name and value the recommended metric
dti_dg_splits (dict): a dictionary with key the dataset name and value the recommended split
epitope_dataset_names (list): all epitope dataset names
evaluator_name (list): list of evaluator names
forwardsyn_dataset_names (list): all reaction dataset names
gda_dataset_names (list): all gda dataset names
generation_datasets (list): all generation dataset names
guacamol_oracle (list): list of oracles from guacamol
hts_dataset_names (list): all hts dataset names
meta_oracle_name (list): list of all meta oracle names
molgenpaired_dataset_names (list): all molgenpaired dataset names
mti_dataset_names (list): all mti dataset names
name2id (dict): mapping from dataset names to dataverse id
name2stats (dict): mapping from dataset names to statistics
name2type (dict): mapping from dataset names to downloaded file format
oracle2id (dict): mapping from oracle names to dataverse id
oracle2type (dict): mapping from oracle names to downloaded file format
receptor2id (dict): mapping from receptor id to dataverse id
oracle_names (list): list of all oracle names
paired_dataset_names (list): all paired dataset names
paratope_dataset_names (list): all paratope dataset names
peptidemhc_dataset_names (list): all peptidemhc dataset names
ppi_dataset_names (list): all ppi dataset names
property_names (list): a list of oracles that correspond to some molecular properties
qm_dataset_names (list): all qm dataset names
retrosyn_dataset_names (list): all retrosyn dataset names
sdf_file_names (list): list of sdf file names
single_molecule_dataset_names (list): all molgen dataset names
multiple_molecule_dataset_names (list): all ligandmolgen dataset names
synthetic_oracle_name (list): all oracle names for synthesis
test_multi_pred_dataset_names (list): test multi pred task name
test_single_pred_dataset_names (list): test single pred task name
toxicity_dataset_names (list): all toxicity dataset names
trivial_oracle_names (list): a list of oracle names for trivial oracles
yield_dataset_names (list): all yield dataset names
"""
####################################
# test cases
test_single_pred_dataset_names = ["test_single_pred"]
test_multi_pred_dataset_names = ["test_multi_pred"]
# single_pred prediction
toxicity_dataset_names = [
"tox21",
"toxcast",
"clintox",
"herg_karim",
"herg",
"herg_central",
"dili",
"skin_reaction",
"ames",
"carcinogens_lagunin",
"ld50_zhu",
]
adme_dataset_names = [
"lipophilicity_astrazeneca",
"solubility_aqsoldb",
"hydrationfreeenergy_freesolv",
"caco2_wang",
"pampa_ncats",
"approved_pampa_ncats",
"hia_hou",
"pgp_broccatelli",
"bioavailability_ma",
"vdss_lombardo",
"cyp2c19_veith",
"cyp2d6_veith",
"cyp3a4_veith",
"cyp1a2_veith",
"cyp2c9_veith",
"cyp2c9_substrate_carbonmangels",
"cyp2d6_substrate_carbonmangels",
"cyp3a4_substrate_carbonmangels",
"bbb_martins",
"ppbr_az",
"half_life_obach",
"clearance_hepatocyte_az",
"clearance_microsome_az",
]
hts_dataset_names = ["hiv",
"sarscov2_3clpro_diamond",
"sarscov2_vitro_touret",
"orexin1_receptor_butkiewicz",
"m1_muscarinic_receptor_agonists_butkiewicz",
"m1_muscarinic_receptor_antagonists_butkiewicz",
"potassium_ion_channel_kir2.1_butkiewicz",
"kcnq2_potassium_channel_butkiewicz",
"cav3_t-type_calcium_channels_butkiewicz",
"choline_transporter_butkiewicz",
"serine_threonine_kinase_33_butkiewicz",
"tyrosyl-dna_phosphodiesterase_butkiewicz"]
qm_dataset_names = ["qm7", "qm7b", "qm8", "qm9"]
epitope_dataset_names = ["iedb_jespersen", "pdb_jespersen"]
paratope_dataset_names = ["sabdab_liberis"]
develop_dataset_names = ["tap", "sabdab_chen"]
# multi_pred prediction
dti_dataset_names = [
"davis",
"kiba",
"bindingdb_kd",
"bindingdb_ic50",
"bindingdb_ki",
"bindingdb_patent",
]
ppi_dataset_names = ["huri"]
peptidemhc_dataset_names = ["mhc2_iedb_jensen", "mhc1_iedb-imgt_nielsen"]
ddi_dataset_names = ["drugbank", "twosides"]
mti_dataset_names = ["mirtarbase"]
gda_dataset_names = ["disgenet"]
crisproutcome_dataset_names = ["leenay"]
drugres_dataset_names = ["gdsc1", "gdsc2"]
drugsyn_dataset_names = ["oncopolypharmacology", "drugcomb"]
antibodyaff_dataset_names = ["protein_sabdab"]
yield_dataset_names = ["uspto_yields", "buchwald-hartwig"]
catalyst_dataset_names = ["uspto_catalyst"]
tcr_epi_dataset_names = ["weber"]
####################################
# generation
retrosyn_dataset_names = ["uspto50k", "uspto"]
forwardsyn_dataset_names = ["uspto"]
single_molecule_dataset_names = ["zinc", "moses", "chembl", "chembl_v29"]
multiple_molecule_dataset_names = ["dude", "pdbbind", "scpdb"] #'crossdock',
paired_dataset_names = ["uspto50k", "uspto"]
####################################
# resource
compound_library_names = [
"drugbank_drugs",
"chembl_drugs",
"broad_repurposing_hub",
"antivirals",
]
biokg_library_names = ["hetionet"]
####################################
# oracles
#### evaluator for distribution learning, the input of __call__ is list of smiles
distribution_oracles = [
"novelty",
"diversity",
"uniqueness",
"validity",
"fcd_distance",
"kl_divergence",
]
docking_oracles = ["rmsd", "kabsch_rmsd", "smina"]
property_names = [
"drd2",
"qed",
"logp",
"sa",
"gsk3b",
"jnk3",
]
evaluator_name = [
"roc-auc",
"f1",
"pr-auc",
"precision",
"recall",
"accuracy",
"mse",
"rmse",
"mae",
"r2",
"micro-f1",
"macro-f1",
"kappa",
"avg-roc-auc",
"rp@k",
"pr@k",
"pcc",
"spearman",
"range_logAUC",
]
evaluator_name.extend(distribution_oracles)
evaluator_name.extend(docking_oracles)
guacamol_oracle = [
"rediscovery",
"similarity",
"median",
"isomers",
"mpo",
"hop",
"celecoxib_rediscovery",
"troglitazone_rediscovery",
"thiothixene_rediscovery",
"aripiprazole_similarity",
"albuterol_similarity",
"mestranol_similarity",
"isomers_c7h8n2o2",
"isomers_c9h10n2o2pf2cl",
"isomers_c11h24",
"osimertinib_mpo",
"fexofenadine_mpo",
"ranolazine_mpo",
"perindopril_mpo",
"amlodipine_mpo",
"sitagliptin_mpo",
"zaleplon_mpo",
"sitagliptin_mpo_prev",
"zaleplon_mpo_prev",
"median1",
"median2",
"valsartan_smarts",
"deco_hop",
"scaffold_hop",
]
####################################
# Benchmark Datasets
admet_benchmark = {
"ADME": [
"caco2_wang",
"hia_hou",
"pgp_broccatelli",
"bioavailability_ma",
"lipophilicity_astrazeneca",
"solubility_aqsoldb",
"bbb_martins",
"ppbr_az",
"vdss_lombardo",
"cyp2d6_veith",
"cyp3a4_veith",
"cyp2c9_veith",
"cyp2d6_substrate_carbonmangels",
"cyp3a4_substrate_carbonmangels",
"cyp2c9_substrate_carbonmangels",
"half_life_obach",
"clearance_microsome_az",
"clearance_hepatocyte_az",
],
"Tox": ["herg", "ames", "dili", "ld50_zhu"],
}
drugsyn_benchmark = {
"Synergy": [
"drugcomb_css",
"drugcomb_hsa",
"drugcomb_loewe",
"drugcomb_bliss",
"drugcomb_zip",
]
}
dti_dg_benchmark = {"DTI": ["bindingdb_patent"]}
docking_benchmark = {
"Targets": [
"1iep",
"2rgp",
"3eml",
"3ny8",
"4rlu",
"4unn",
"5mo4",
"7l11",
"3pbl",
]
}
docking_target_info = {
"3pbl": {"center": (9, 22.5, 26), "size": (15, 15, 15)},
"1iep": {
"center": (15.61389189189189, 53.38013513513513, 15.454837837837842),
"size": (15, 15, 15),
},
"2rgp": {
"center": (16.292121212121213, 34.87081818181819, 92.0353030303030),
"size": (15, 15, 15),
},
"3eml": {
"center": (-9.063639999999998, -7.1446, 55.86259999999999),
"size": (15, 15, 15),
},
"3ny8": {"center": (2.2488, 4.68495, 51.39820000000001), "size": (15, 15, 15)},
"4rlu": {
"center": (-0.7359999999999999, 22.75547368421052, -31.2368947368421),
"size": (15, 15, 15),
},
"4unn": {
"center": (5.684346153846153, 18.191769230769232, -7.37157692307692),
"size": (15, 15, 15),
},
"5mo4": {
"center": (-44.901709677419355, 20.490354838709674, 8.483354838709678),
"size": (15, 15, 15),
},
"7l11": {
"center": (-21.814812500000006, -4.216062499999999, -27.983781250000),
"size": (15, 15, 15),
},
}
####################################
#### Benchmark Metrics
admet_metrics = {
"caco2_wang": "mae",
"hia_hou": "roc-auc",
"pgp_broccatelli": "roc-auc",
"bioavailability_ma": "roc-auc",
"lipophilicity_astrazeneca": "mae",
"solubility_aqsoldb": "mae",
"bbb_martins": "roc-auc",
"ppbr_az": "mae",
"vdss_lombardo": "spearman",
"cyp2c9_veith": "pr-auc",
"cyp2d6_veith": "pr-auc",
"cyp3a4_veith": "pr-auc",
"cyp2c9_substrate_carbonmangels": "pr-auc",
"cyp3a4_substrate_carbonmangels": "roc-auc",
"cyp2d6_substrate_carbonmangels": "pr-auc",
"half_life_obach": "spearman",
"clearance_hepatocyte_az": "spearman",
"clearance_microsome_az": "spearman",
"ld50_zhu": "mae",
"herg": "roc-auc",
"ames": "roc-auc",
"dili": "roc-auc",
}
drugsyn_metrics = {
"drugcomb_css": "mae",
"drugcomb_hsa": "mae",
"drugcomb_loewe": "mae",
"drugcomb_bliss": "mae",
"drugcomb_zip": "mae",
"drugcomb_css_brain": "mae",
"drugcomb_css_ovary": "mae",
"drugcomb_css_lung": "mae",
"drugcomb_css_skin": "mae",
"drugcomb_css_hematopoietic_lymphoid": "mae",
"drugcomb_css_breast": "mae",
"drugcomb_css_prostate": "mae",
"drugcomb_css_kidney": "mae",
"drugcomb_css_colon": "mae",
}
dti_dg_metrics = {"bindingdb_patent": "pcc"}
#### Benchmark Splits
admet_splits = {
"caco2_wang": "scaffold",
"hia_hou": "scaffold",
"pgp_broccatelli": "scaffold",
"bioavailability_ma": "scaffold",
"lipophilicity_astrazeneca": "scaffold",
"solubility_aqsoldb": "scaffold",
"bbb_martins": "scaffold",
"ppbr_az": "scaffold",
"vdss_lombardo": "scaffold",
"cyp2c9_veith": "scaffold",
"cyp2d6_veith": "scaffold",
"cyp3a4_veith": "scaffold",
"cyp2c9_substrate_carbonmangels": "scaffold",
"cyp3a4_substrate_carbonmangels": "scaffold",
"cyp2d6_substrate_carbonmangels": "scaffold",
"half_life_obach": "scaffold",
"clearance_hepatocyte_az": "scaffold",
"clearance_microsome_az": "scaffold",
"ld50_zhu": "scaffold",
"herg": "scaffold",
"ames": "scaffold",
"dili": "scaffold",
}
drugsyn_splits = {
"drugcomb_css": "combination",
"drugcomb_hsa": "combination",
"drugcomb_loewe": "combination",
"drugcomb_bliss": "combination",
"drugcomb_zip": "combination",
}
dti_dg_splits = {"bindingdb_patent": "group"}
####################################
# evaluator for single molecule, the input of __call__ is a single smiles OR list of smiles
download_oracle_names = ["drd2", "gsk3b", "jnk3", "fpscores", "cyp3a4_veith", "smina"]
# download_oracle_names = ['drd2', 'gsk3b', 'jnk3', 'fpscores', 'cyp3a4_veith']
download_oracle_names = ["drd2", "gsk3b", "jnk3", "fpscores", "cyp3a4_veith"] + [
"drd2_current",
"gsk3b_current",
"jnk3_current",
]
trivial_oracle_names = ["qed", "logp", "sa"] + guacamol_oracle
synthetic_oracle_name = ["askcos", "ibm_rxn"]
download_receptor_oracle_name = [
"1iep_docking",
"2rgp_docking",
"3eml_docking",
"3ny8_docking",
"4rlu_docking",
"4unn_docking",
"5mo4_docking",
"7l11_docking",
"drd3_docking",
"3pbl_docking",
"1iep_docking_normalize",
"2rgp_docking_normalize",
"3eml_docking_normalize",
"3ny8_docking_normalize",
"4rlu_docking_normalize",
"4unn_docking_normalize",
"5mo4_docking_normalize",
"7l11_docking_normalize",
"drd3_docking_normalize",
"3pbl_docking_normalize",
"1iep_docking_vina",
"2rgp_docking_vina",
"3eml_docking_vina",
"3ny8_docking_vina",
"4rlu_docking_vina",
"4unn_docking_vina",
"5mo4_docking_vina",
"7l11_docking_vina",
"drd3_docking_vina",
"3pbl_docking_vina",
]
meta_oracle_name = [
"isomer_meta",
"rediscovery_meta",
"similarity_meta",
"median_meta",
"docking_score",
"molecule_one_synthesis",
"pyscreener",
]
oracle_names = (
download_oracle_names
+ trivial_oracle_names
+ distribution_oracles
+ synthetic_oracle_name
+ meta_oracle_name
+ docking_oracles
+ download_receptor_oracle_name
)
molgenpaired_dataset_names = ["qed", "drd2", "logp"]
generation_datasets = (
retrosyn_dataset_names
+ forwardsyn_dataset_names
+ molgenpaired_dataset_names
+ multiple_molecule_dataset_names
)
# generation
####################################
category_names = {
"single_pred": [
"Tox",
"ADME",
"HTS",
"Epitope",
"Develop",
"QM",
"Paratope",
"Yields",
"CRISPROutcome",
],
"multi_pred": [
"DTI",
"PPI",
"DDI",
"PeptideMHC",
"DrugRes",
"AntibodyAff",
"DrugSyn",
"MTI",
"GDA",
"Catalyst",
"TCR_Epitope_Binding",
],
"generation": ["RetroSyn", "Reaction", "MolGen"],
}
[docs]def get_task2category():
task2category = {}
for i, j in category_names.items():
for x in j:
task2category[x] = i
return task2category
dataset_names = {
"Tox": toxicity_dataset_names,
"ADME": adme_dataset_names,
"HTS": hts_dataset_names,
"DTI": dti_dataset_names,
"PPI": ppi_dataset_names,
"DDI": ddi_dataset_names,
"RetroSyn": retrosyn_dataset_names,
"Reaction": forwardsyn_dataset_names,
"MolGen": single_molecule_dataset_names,
"sbdd": multiple_molecule_dataset_names,
"PeptideMHC": peptidemhc_dataset_names,
"Epitope": epitope_dataset_names,
"Develop": develop_dataset_names,
"DrugRes": drugres_dataset_names,
"QM": qm_dataset_names,
"AntibodyAff": antibodyaff_dataset_names,
"DrugSyn": drugsyn_dataset_names,
"MTI": mti_dataset_names,
"GDA": gda_dataset_names,
"Paratope": paratope_dataset_names,
"Yields": yield_dataset_names,
"Catalyst": catalyst_dataset_names,
"CRISPROutcome": crisproutcome_dataset_names,
"test_single_pred": test_single_pred_dataset_names,
"test_multi_pred": test_multi_pred_dataset_names,
"TCREpitopeBinding": tcr_epi_dataset_names,
}
benchmark_names = {
"admet_group": admet_benchmark,
"drugcombo_group": drugsyn_benchmark,
"docking_group": docking_benchmark,
"dti_dg_group": dti_dg_benchmark,
}
bm_metric_names = {
"admet_group": admet_metrics,
"drugcombo_group": drugsyn_metrics,
"dti_dg_group": dti_dg_metrics,
}
bm_split_names = {
"admet_group": admet_splits,
"drugcombo_group": drugsyn_splits,
"dti_dg_group": dti_dg_splits,
}
dataset_list = []
for i in dataset_names.keys():
dataset_list = dataset_list + [i.lower() for i in dataset_names[i]]
name2type = {
"toxcast": "tab",
"tox21": "tab",
"clintox": "tab",
"lipophilicity_astrazeneca": "tab",
"solubility_aqsoldb": "tab",
"hydrationfreeenergy_freesolv": "tab",
"caco2_wang": "tab",
"pampa_ncats": "tab",
"approved_pampa_ncats": "tab",
"hia_hou": "tab",
"pgp_broccatelli": "tab",
"f20_edrug3d": "tab",
"f30_edrug3d": "tab",
"bioavailability_ma": "tab",
"vd_edrug3d": "tab",
"cyp2c19_veith": "tab",
"cyp2d6_veith": "tab",
"cyp3a4_veith": "tab",
"cyp1a2_veith": "tab",
"cyp2c9_veith": "tab",
"cyp2c9_substrate_carbonmangels": "tab",
"cyp2d6_substrate_carbonmangels": "tab",
"cyp3a4_substrate_carbonmangels": "tab",
"carcinogens_lagunin": "tab",
"halflife_edrug3d": "tab",
"clearance_edrug3d": "tab",
"bbb_adenot": "tab",
"bbb_martins": "tab",
"ppbr_ma": "tab",
"ppbr_edrug3d": "tab",
"hiv": "tab",
"sarscov2_3clpro_diamond": "tab",
"sarscov2_vitro_touret": "tab",
"orexin1_receptor_butkiewicz": "tab",
"m1_muscarinic_receptor_agonists_butkiewicz": "tab",
"m1_muscarinic_receptor_antagonists_butkiewicz": "tab",
"potassium_ion_channel_kir2.1_butkiewicz": "tab",
"kcnq2_potassium_channel_butkiewicz": "tab",
"cav3_t-type_calcium_channels_butkiewicz": "tab",
"choline_transporter_butkiewicz": "tab",
"serine_threonine_kinase_33_butkiewicz": "tab",
"tyrosyl-dna_phosphodiesterase_butkiewicz": "tab",
"davis": "tab",
"kiba": "tab",
"bindingdb_kd": "tab",
"bindingdb_ic50": "csv",
"bindingdb_ki": "csv",
"bindingdb_patent": "csv",
"huri": "tab",
"drugbank": "tab",
"twosides": "csv",
"mhc1_iedb-imgt_nielsen": "tab",
"mhc2_iedb_jensen": "tab",
"uspto": "csv",
"uspto50k": "tab",
"zinc": "tab",
"moses": "tab",
"chembl": "tab",
"chembl_v29": "csv",
"qed": "tab",
"drd2": "tab",
"logp": "tab",
"drugcomb": "pkl",
"gdsc1": "pkl",
"gdsc2": "pkl",
"iedb_jespersen": "pkl",
"pdb_jespersen": "pkl",
"qm7": "pkl",
"qm7b": "pkl",
"qm8": "pkl",
"qm9": "pkl",
"scpdb": "zip",
"dude": "zip",
# 'crossdock': 'zip',
"tap": "tab",
"sabdab_chen": "tab",
"protein_sabdab": "csv",
"oncopolypharmacology": "pkl",
"mirtarbase": "csv",
"disgenet": "csv",
"sabdab_liberis": "pkl",
"uspto_yields": "pkl",
"uspto_catalyst": "csv",
"buchwald-hartwig": "pkl",
"hetionet": "tab",
"herg": "tab",
"herg_central": "tab",
"herg_karim": "tab",
"dili": "tab",
"ppbr_az": "tab",
"ames": "tab",
"skin_reaction": "tab",
"drugbank_drugs": "csv",
"clearance_microsome_az": "tab",
"clearance_hepatocyte_az": "tab",
"half_life_obach": "tab",
"ld50_zhu": "tab",
"vdss_lombardo": "tab",
"leenay": "tab",
"test_single_pred": "tab",
"test_multi_pred": "tab",
"gdsc_gene_symbols": "tab",
"weber": "tab",
"primekg": "tab",
"primekg_drug_feature": "tab",
"primekg_disease_feature": "tab",
}
name2id = {
"bbb_adenot": 4259565,
"bbb_martins": 4259566,
"bindingdb_ic50": 4291560,
"bindingdb_kd": 4291555,
"bindingdb_ki": 4291556,
"bindingdb_patent": 4724851,
"bioavailability_ma": 4259567,
"caco2_wang": 4259569,
"pampa_ncats": 6695858,
"approved_pampa_ncats": 6695857,
"clearance_edrug3d": 4259571,
"clintox": 4259572,
"cyp1a2_veith": 4259573,
"cyp2c19_veith": 4259576,
"cyp2c9_veith": 4259577,
"cyp2d6_veith": 4259580,
"cyp3a4_veith": 4259582,
"cyp2c9_substrate_carbonmangels": 4259584,
"cyp2d6_substrate_carbonmangels": 4259578,
"cyp3a4_substrate_carbonmangels": 4259581,
"carcinogens_lagunin": 4259570,
"davis": 5219748,
"drugbank": 4139573,
"drugcomb": 4215720,
"f20_edrug3d": 4259586,
"f30_edrug3d": 4259589,
"halflife_edrug3d": 4259587,
"hia_hou": 4259591,
"hiv": 4259593,
"huri": 4139567,
"hydrationfreeenergy_freesolv": 4259594,
"kiba": 5255037,
"lipophilicity_astrazeneca": 4259595,
"pgp_broccatelli": 4259597,
"ppbr_edrug3d": 4259600,
"ppbr_ma": 4259603,
"sarscov2_3clpro_diamond": 4259606,
"sarscov2_vitro_touret": 4259607,
"orexin1_receptor_butkiewicz": 6894447,
"m1_muscarinic_receptor_agonists_butkiewicz": 6894443,
"m1_muscarinic_receptor_antagonists_butkiewicz": 6894446,
"potassium_ion_channel_kir2.1_butkiewicz": 6894442,
"kcnq2_potassium_channel_butkiewicz": 6894444,
"cav3_t-type_calcium_channels_butkiewicz": 6894445,
"choline_transporter_butkiewicz": 6894441,
"serine_threonine_kinase_33_butkiewicz": 6894448,
"tyrosyl-dna_phosphodiesterase_butkiewicz": 6894440,
"solubility_aqsoldb": 4259610,
"tox21": 4259612,
"toxcast": 4259613,
"twosides": 4139574,
"vd_edrug3d": 4259618,
"mhc1_iedb-imgt_nielsen": 4167073,
"mhc2_iedb_jensen": 4167074,
"zinc": 4170963,
"moses": 4170962,
"chembl": 4170965,
"chembl_v29": 5767979,
"qed": 4170959,
"drd2": 4170957,
"logp": 4170961,
"gdsc1": 4165726,
"gdsc2": 4165727,
"iedb_jespersen": 4165725,
"pdb_jespersen": 4165724,
"qm7": 6358510,
"qm7b": 6358512,
"qm8": 6358513,
"qm9": 6179310, ### 4167112, 6175612
# 'scpdb': None,
# 'dude': None,
# 'crossdock': None,
"tap": 4167113,
"sabdab_chen": 4167164,
"protein_sabdab": 4167357,
"oncopolypharmacology": 4167358,
"mirtarbase": 4167359,
"disgenet": 4168282,
"sabdab_liberis": 4168425,
"uspto50k": 4171823,
"buchwald-hartwig": 6175640,
"uspto_yields": 4186956,
"uspto_catalyst": 4171574,
"uspto": 4171642,
"hetionet": 4201734,
"herg": 4259588,
"herg_central": 5740618,
"herg_karim": 6822246,
"dili": 4259585,
"ppbr_az": 6413140,
"ames": 4259564,
"skin_reaction": 4259609,
"clearance_microsome_az": 4266186,
"clearance_hepatocyte_az": 4266187,
"ld50_zhu": 4267146,
"half_life_obach": 4266799,
"vdss_lombardo": 4267387,
"leenay": 4279966,
"test_single_pred": 4832455,
"test_multi_pred": 4832456,
"gdsc_gene_symbols": 5255026,
"weber": 5790963,
"primekg": 6180626,
"primekg_drug_feature": 6180619,
"primekg_disease_feature": 6180618,
}
oracle2type = {
"drd2": "pkl",
"jnk3": "pkl",
"gsk3b": "pkl",
"fpscores": "pkl",
"cyp3a4_veith": "pkl",
"smina": "static",
"drd2_current": "pkl",
"jnk3_current": "pkl",
"gsk3b_current": "pkl",
}
oracle2id = {
"drd2": 4178625,
"gsk3b": 4170295,
"jnk3": 4170293,
"fpscores": 4170416,
"cyp3a4_veith": 4411249,
"smina": 6361665,
"cyp3a4_veith": 4411249,
"drd2_current": 6413411,
"jnk3_current": 6413420,
"gsk3b_current": 6413412,
}
benchmark2type = {
"admet_group": "zip",
"drugcombo_group": "zip",
"docking_group": "zip",
"dti_dg_group": "zip",
}
benchmark2id = {
"admet_group": 4426004,
"drugcombo_group": 4426002,
"docking_group": 4554082,
"dti_dg_group": 4742443,
}
receptor2id = {
"1iep": [5137914, 5617659],
"2rgp": [5137916, 5617662],
"3eml": [5137919, 5617663],
"3ny8": [5137915, 5617665],
"4rlu": [5137918, 5617658],
"4unn": [5137917, 5617661],
"5mo4": [5137920, 5617664],
"7l11": [5137921, 5617660],
"3pbl": [5257195, 5617666],
} ## 'drd3': 5137901,
sdf_file_names = {"grambow": ["Product", "Reactant", "TS"]}
name2stats = {
"caco2_wang": 906,
"hia_hou": 578,
"pgp_broccatelli": 1212,
"bioavailability_ma": 640,
"lipophilicity_astrazeneca": 4200,
"solubility_aqsoldb": 9982,
"bbb_martins": 1975,
"ppbr_az": 1797,
"vdss_lombardo": 1130,
"cyp2c19_veith": 12092,
"cyp2d6_veith": 13130,
"cyp3a4_veith": 12328,
"cyp1a2_veith": 12579,
"cyp2c9_veith": 12092,
"cyp2c9_substrate_carbonmangels": 666,
"cyp2d6_substrate_carbonmangels": 664,
"cyp3a4_substrate_carbonmangels": 667,
"half_life_obach": 667,
"clearance_hepatocyte_az": 1020,
"clearance_microsome_az": 1102,
"ld50_zhu": 7385,
"herg": 648,
"ames": 7255,
"dili": 475,
"skin_reaction": 404,
"carcinogens_lagunin": 278,
"tox21": 7831,
"clintox": 1484,
"sarscov2_vitro_touret": 1480,
"sarscov2_3clpro_diamond": 879,
"hiv": 41127,
"qm7": 7165,
"qm7b": 7211,
"qm8": 21747,
"qm9": 133885,
"uspto_yields": 853638,
"buchwald-hartwig": 55370,
"sabdab_liberis": 1023,
"iedb_jespersen": 3159,
"pdb_jespersen": 447,
"tap": 242,
"sabdab_chen": 2409,
"leenay": 1521,
"bindingdb_kd": 52284,
"bindingdb_ki": 375032,
"bindingdb_ic50": 991486,
"bindingdb_patent": 243344,
"davis": 27621,
"kiba": 118036,
"drugbank": 191808,
"twosides": 4649441,
"huri": 51813,
"disgenet": 52476,
"gdsc1": 177310,
"gdsc2": 92703,
"drugcomb": 297098,
"oncopolypharmacology": 23052,
"mhc1_iedb-imgt_nielsen": 185985,
"mhc2_iedb_jensen": 134281,
"protein_sabdab": 493,
"mirtarbase": 400082,
"uspto_catalyst": 721799,
"moses": 1936962,
"zinc": 249455,
"chembl": 1961462,
"uspto50k": 50036,
"uspto": 1939253,
}
name2idlist = {
"dude": [6429245, 6429251],
"scpdb": [6431629, 6431631],
}