Source code for tdc.utils.query

"""Utility functions for querying PubChem and UniProt.
"""
import json
import os, sys

try:
    from urllib.error import HTTPError
    from urllib.parse import quote, urlencode
    from urllib.request import urlopen
except ImportError:
    from urllib import urlencode
    from urllib2 import quote, urlopen, HTTPError


def _parse_prop(search, proplist):
    """Return the value of the first property whose urn matches ``search``.

    A property matches when every key/value pair of ``search`` is also
    present in the property's ``"urn"`` mapping.  The value returned is the
    entry stored under the first key of that property's ``"value"`` mapping.
    When nothing matches, the function returns ``None``.
    """
    matches = []
    for prop in proplist:
        is_match = True
        for pair in search.items():
            if pair not in prop["urn"].items():
                is_match = False
                break
        if is_match:
            matches.append(prop)

    if not matches:
        return None

    value = matches[0]["value"]
    first_key = list(value.keys())[0]
    return value[first_key]


def request(
    identifier,
    namespace="cid",
    domain="compound",
    operation=None,
    output="JSON",
    searchtype=None,
):
    """Build a PubChem PUG REST request from the arguments and return the response.

    copied from https://github.com/mcs07/PubChemPy/blob/e3c4f4a9b6120433e5cc3383464c7a79e9b2b86e/pubchempy.py#L238
    Full specification at http://pubchem.ncbi.nlm.nih.gov/pug_rest/PUG_REST.html

    Args:
        identifier: an int, a string, or an iterable of values; an iterable
            is joined into one comma-separated string.
        namespace (str): URL path component; also used as the form field
            name when the identifier is sent in the request body.
        domain (str): URL path component placed right after the API base.
        operation (str, optional): URL path component, skipped when empty.
        output (str): final URL path component.
        searchtype (str, optional): URL path component, skipped when empty.

    Returns:
        The response object returned by ``urlopen``.

    Raises:
        ValueError: if ``identifier`` is falsy.
    """
    if not identifier:
        raise ValueError("identifier/cid cannot be None")

    # Normalise the identifier to a single string (lists become "a,b,c").
    if isinstance(identifier, int):
        identifier = str(identifier)
    if not isinstance(identifier, (str, bytes)):
        identifier = ",".join(map(str, identifier))

    if namespace == "sourceid":
        identifier = identifier.replace("/", ".")

    # Decide whether the identifier travels in the URL path or in the body.
    identifier_in_url = (
        namespace in ("listkey", "formula", "sourceid")
        or searchtype == "xref"
        or (searchtype and namespace == "cid")
        or domain == "sources"
    )
    if identifier_in_url:
        url_part = quote(identifier.encode("utf8"))
        body = None
    else:
        url_part = None
        body = urlencode([(namespace, identifier)]).encode("utf8")

    # Empty/None components are dropped from the path.
    segments = [
        "https://pubchem.ncbi.nlm.nih.gov/rest/pug",
        domain,
        searchtype,
        namespace,
        url_part,
        operation,
        output,
    ]
    endpoint = "/".join(part for part in segments if part)

    return urlopen(endpoint, body)
def uniprot2seq(ProteinID):
    """Get protein sequence from Uniprot ID

    Downloads the FASTA record for the given ID, drops its first line (the
    FASTA header) and concatenates the remaining lines into one string.

    Args:
        ProteinID (str): the uniprot ID

    Returns:
        str: amino acid sequence
    """
    # NOTE(review): this is the legacy www.uniprot.org URL; confirm it still
    # resolves to the FASTA download for current UniProt deployments.
    url = "http://www.uniprot.org/uniprot/" + str(ProteinID) + ".fasta"

    # The context manager closes the HTTP response even if reading fails.
    with urlopen(url) as response:
        lines = response.readlines()

    # Skip the header line; strip line endings before decoding each chunk.
    return "".join(line.strip().decode("utf-8") for line in lines[1:])
def cid2smiles(cid):
    """SMILES string from PubChem CID

    Args:
        cid (str): PubChem CID

    Returns:
        str: SMILES string, or the literal string "NULL" when the lookup
        raises. The result is None when the record contains no property
        whose urn has label "SMILES" and name "Canonical", because
        ``_parse_prop`` returns nothing in that case.
    """
    try:
        # Close the HTTP response as soon as the payload has been read.
        with request(cid) as response:
            record = json.loads(response.read().decode())
        smiles = _parse_prop(
            {"label": "SMILES", "name": "Canonical"},
            record["PC_Compounds"][0]["props"],
        )
    except Exception:
        # Best-effort lookup: any request or parsing failure falls back to
        # "NULL". Catching Exception (not a bare except) lets
        # KeyboardInterrupt and SystemExit propagate.
        print("cid " + str(cid) + " failed, use NULL string")
        smiles = "NULL"
    return smiles