# Source code for tdc.generation.retrosyn

# -*- coding: utf-8 -*-
# Author: TDC Team
# License: MIT

import warnings
warnings.filterwarnings("ignore")

from . import generation_dataset
from ..metadata import dataset_names
from ..utils import create_fold

class RetroSyn(generation_dataset.PairedDataLoader):
    """Data loader class for the retrosynthesis prediction task.

    Thin wrapper around ``generation_dataset.PairedDataLoader`` whose default
    input column is ``'product'`` and default output column is ``'reactant'``.
    """

    def __init__(self, name, path='./data', print_stats=False,
                 input_name='product', output_name='reactant'):
        """Create a data loader object for the retrosynthesis prediction task.

        The goal is to predict the reactants given a product.

        Args:
            name (str): the name of the dataset
            path (str, optional): the path to the saved data file.
            print_stats (bool, optional): whether to print the basic
                statistics
            input_name (str, optional): the name of the column containing
                input molecular data (product)
            output_name (str, optional): the name of the column containing
                output molecular data (reactant)
        """
        super().__init__(name, path, print_stats, input_name, output_name)

    def get_split(self, method='random', seed=42, frac=[0.7, 0.1, 0.2],
                  include_reaction_type=False):
        """Return the data split into train, valid and test sets.

        Args:
            method (str): splitting scheme; only ``'random'`` is handled by
                this method.
            seed (int): random seed, default 42
            frac (list of float): ratio of train/val/test split. The default
                list is never modified here; a copy is handed to
                ``create_fold``.
            include_reaction_type (bool): whether to add a ``reaction_type``
                column (looked up with ``get_reaction_type(self.name)``)
                before splitting

        Returns:
            Whatever ``create_fold(df, seed, frac)`` returns -- presumably
            the train/valid/test partitions of the dataframe (confirm against
            ``tdc.utils.create_fold``).

        Raises:
            ValueError: if the reaction type cannot be obtained or attached
                for this dataset.
            AttributeError: if ``method`` is not ``'random'``.
        """
        df = self.get_data(format='df')

        if include_reaction_type:
            from ..utils import get_reaction_type
            try:
                df['reaction_type'] = get_reaction_type(self.name)
            except Exception as err:
                # Catch Exception rather than using a bare except so that
                # KeyboardInterrupt/SystemExit still propagate, and chain the
                # original error so the underlying cause stays visible.
                raise ValueError(
                    'Reaction Type Unavailable for ' + str(self.name)
                    + '! Please turn include_reaction_type to be false!'
                ) from err

        if method == 'random':
            # Pass a copy so the shared default list can never be mutated
            # by the callee across calls.
            return create_fold(df, seed, list(frac))

        raise AttributeError("Please use the correct split method")