Source code for tdc.utils.query

"""Utility functions for querying PubChem and UniProt.
"""
import json
import os, sys
try:
	from urllib.error import HTTPError
	from urllib.parse import quote, urlencode
	from urllib.request import urlopen
except ImportError:
	from urllib import urlencode
	from urllib2 import quote, urlopen, HTTPError

def _parse_prop(search, proplist):
	"""Look up a property value in a list of property records.

	A record matches when every key/value pair of ``search`` also appears
	in the record's ``'urn'`` mapping. The result is taken from the first
	matching record: the entry stored under the first key of its
	``'value'`` mapping.

	Args:
		search (dict): key/value pairs that the record's ``'urn'`` must contain
		proplist (list): records, each carrying ``'urn'`` and ``'value'`` mappings

	Returns:
		the value of the first matching record, or None when nothing matches
	"""
	matches = []
	for record in proplist:
		is_match = True
		for pair in search.items():
			if pair not in record['urn'].items():
				is_match = False
				break
		if is_match:
			matches.append(record)

	# Guard clause: no record satisfied the filter.
	if len(matches) == 0:
		return None

	value = matches[0]['value']
	first_key = list(value.keys())[0]
	return value[first_key]

def request(identifier, namespace='cid', domain='compound', operation=None, output='JSON', searchtype=None):
	"""Construct API request from parameters and return the response.

	Copied from
	https://github.com/mcs07/PubChemPy/blob/e3c4f4a9b6120433e5cc3383464c7a79e9b2b86e/pubchempy.py#L238

	Full specification at http://pubchem.ncbi.nlm.nih.gov/pug_rest/PUG_REST.html

	Args:
		identifier (int, str or iterable): a single identifier, or an iterable
			of identifiers that is joined with commas
		namespace (str): URL path component naming the identifier type;
			defaults to 'cid'
		domain (str): URL path component following the API base;
			defaults to 'compound'
		operation (str, optional): URL path component placed before the output
			format; omitted when None
		output (str): last URL path component; defaults to 'JSON'
		searchtype (str, optional): URL path component placed between domain
			and namespace; omitted when None

	Returns:
		the response object returned by ``urlopen``

	Raises:
		ValueError: if ``identifier`` is empty, zero or None
	"""
	API_BASE = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug'
	text_types = str, bytes
	if not identifier:
		raise ValueError('identifier/cid cannot be None')
	# A single integer identifier becomes its decimal string.
	if isinstance(identifier, int):
		identifier = str(identifier)
	# If identifier is a list, join with commas into string
	if not isinstance(identifier, text_types):
		identifier = ','.join(str(x) for x in identifier)
	# Build API URL
	urlid, postdata = None, None
	if namespace == 'sourceid':
		identifier = identifier.replace('/', '.')
	# These cases put the identifier in the URL path; every other case sends
	# it in the request body as urlencoded data.
	if (namespace in ['listkey', 'formula', 'sourceid']
			or searchtype == 'xref'
			or (searchtype and namespace == 'cid')
			or domain == 'sources'):
		urlid = quote(identifier.encode('utf8'))
	else:
		postdata = urlencode([(namespace, identifier)]).encode('utf8')
	# filter(None, ...) drops the path components that were left as None.
	comps = filter(None, [API_BASE, domain, searchtype, namespace, urlid, operation, output])
	apiurl = '/'.join(comps)
	# Make request
	response = urlopen(apiurl, postdata)
	return response
def uniprot2seq(ProteinID):
	"""Get protein sequence from Uniprot ID

	Downloads ``http://www.uniprot.org/uniprot/<ProteinID>.fasta`` and
	concatenates every line after the first one.

	Args:
		ProteinID (str): the uniprot ID

	Returns:
		str: amino acid sequence
	"""
	import urllib.request as ur

	url = 'http://www.uniprot.org/uniprot/' + str(ProteinID) + '.fasta'
	# The context manager closes the HTTP response once the body is read.
	with ur.urlopen(url) as response:
		lines = response.readlines()
	# The first line is skipped (the FASTA header); the remaining lines are
	# stripped, decoded and joined into one string.
	return ''.join(line.strip().decode("utf-8") for line in lines[1:])
def cid2smiles(cid):
	"""SMILES string from PubChem CID

	Fetches the compound record through ``request`` and extracts the property
	whose urn has label 'SMILES' and name 'Canonical'. Any error during the
	lookup is reported on stdout and the string 'NULL' is returned instead.

	Args:
		cid (str): PubChem CID

	Returns:
		str: SMILES string, or 'NULL' if the lookup failed
	"""
	try:
		record = json.loads(request(cid).read().decode())
		smiles = _parse_prop({'label': 'SMILES', 'name': 'Canonical'}, record['PC_Compounds'][0]['props'])
	# Catch Exception rather than using a bare except, so that
	# KeyboardInterrupt and SystemExit still propagate to the caller.
	except Exception:
		print('cid ' + str(cid) + ' failed, use NULL string')
		smiles = 'NULL'
	return smiles