Function Definition
This function will take a list of SNPs rsIDs (i.e. “rs56116432”) and output some relevant annotations such as gene, consequence, etc. Limited to 200 SNPs per call. Uses the ENSEMBL rest API
import requests, json
import pandas as pd
def quick_vep(list_of_rsid, output = "df", complete = False, assembly = "hg19"):
all_options = ["Blosum62", "CADD", "DisGeNET", "GeneSplicer", "LoF", "Mastermind", "MaxEntScan", "Phenotypes", "SpliceAI", "SpliceRegion", "appris", "canonical", "ccds", "dbscSNV", "domains", "hgvs", "miRNA", "numbers", "protein", "refseq", "uniprot", "variant_class", "vcf_string", "xref_refseq"]
if isinstance(list_of_rsid, str): # IF INPUT IS A SINGLE SNP as A STRING
list_of_rsid = [list_of_rsid]
if isinstance(list_of_rsid, list): # LIMIT TO 200 SNPs
if len(list_of_rsid) > 200:
print("Please limit your requests to 200 variants")
return
if assembly == "hg38":
server = "https://rest.ensembl.org"
elif assembly == "hg19":
server = "https://grch37.rest.ensembl.org"
else:
print("Assembly must be either 'hg38' or 'hg19' ")
return
ext = "/vep/human/id"
headers={ "Content-Type" : "application/json", "Accept" : "application/json"}
# Target Format is '{ "ids" : ["rs56116432", "COSM476" ] }'
#FORMAT THE BODY OF THE MESSAGE
variant_dict = {"ids" : list_of_rsid}
if complete:
for option in all_options:
variant_dict[option] = 1
data = json.dumps(variant_dict)
# POST THE REQUEST
r = requests.post(server+ext, headers=headers, data=data)
if r.status_code == 200: # IF CODE = OK
if r.json() != None: # IF SOMETHING IS ACTUALLY RETURNED
return_dict = r.json()
# PROCESS THE DICTIONNARY SO IT HAS CONSISTENT ENTRIES EVEN IF MISSING
for dictionnary in return_dict:
try:
dictionnary["transcript_consequences"][0]
except KeyError:
dictionnary["transcript_consequences"] = [{}]
try:
dictionnary["regulatory_feature_consequences"][0]
except KeyError:
dictionnary["regulatory_feature_consequences"] = [{}]
#Process Dict
for rsidict in return_dict:
# EXPAND SUB-DICTIONARIES into MAIN DICTIONARIES
for key, value in rsidict["transcript_consequences"][0].items():
rsidict[key] = value
for key, value in rsidict["regulatory_feature_consequences"][0].items():
rsidict[key] = value
# REMOVE THE EXPANDED SECTIONS
rsidict.pop("transcript_consequences", None)
rsidict.pop("regulatory_feature_consequences", None)
rsidict.pop("colocated_variants", None)
rsidict.pop("motif_feature_consequences", None)
# SELECT OUTPUT TYPE
if output == "dict":
return return_dict
elif (output == "df") or (output == "dataframe"):
df = pd.DataFrame.from_dict(return_dict)
df.index = df.id
# REORDER to get info at the beginning
old_columns = list(df.columns)
new_columns = ["seq_region_name", "start", "end", "strand", "allele_string"] + [x for x in old_columns if x not in ["seq_region_name", "start", "end", "strand", "allele_string"]]
df = df[new_columns]
return df
else: # IF RESPONSE IS EMPTY JSON
print("Error: Empty Response, try less SNPs or check if the rsIDs exist")
return
else: # RETURN ERROR CODE
print("Error {} during request".format(r.status_code))
return
Example Usage
See output section
list_of_SNPs = ["rs397509431", "rs28936375", "rs72556554" "rs558269137", "rs779175681", "rs144989330", "rs74315294", "rs200980825"]
output = quick_vep(list_of_SNPs, # INPUT IS A LIST OF SNP rsIDs
output = "df", # OUTPUT IS EITHER "dict" or "df"
complete = False, # Use complete = True for more informations and annotations
assembly = "hg19") # Select Assembly
print(output)
Example of Output
id | seq_region_name | start | end | strand | allele_string | … | impact |
---|---|---|---|---|---|---|---|
rs397509431 | 1 | 53210912 | 53210914 | -1 | AGA/A | … | MODIFIER |
rs28936375 | 1 | 53197092 | 53197092 | 1 | C/A | … | MODIFIER |
rs779175681 | 17 | 28737363 | 28737363 | 1 | C/T | … | MODERATE |
rs144989330 | 3 | 132689076 | 132689076 | -1 | G/C | … | MODERATE |
… | … | … | … | … | … | … | … |
rs200980825 | 2 | 162277573 | 162277573 | -1 | A/G | … | MODERATE |