Function Definition

This function will take a list of SNPs rsIDs (i.e. “rs56116432”) and output some relevant annotations such as gene, consequence, etc. Limited to 200 SNPs per call. Uses the ENSEMBL rest API

import requests, json
import pandas as pd

def quick_vep(list_of_rsid, output = "df", complete = False, assembly = "hg19"):
  all_options = ["Blosum62", "CADD", "DisGeNET", "GeneSplicer", "LoF", "Mastermind", "MaxEntScan", "Phenotypes", "SpliceAI", "SpliceRegion", "appris", "canonical", "ccds", "dbscSNV", "domains", "hgvs", "miRNA", "numbers", "protein", "refseq", "uniprot", "variant_class", "vcf_string", "xref_refseq"]
  
  if isinstance(list_of_rsid, str): # IF INPUT IS A SINGLE SNP as A STRING
      list_of_rsid = [list_of_rsid]
  if isinstance(list_of_rsid, list): # LIMIT TO 200 SNPs
      if len(list_of_rsid) > 200:
          print("Please limit your requests to 200 variants")
          return
  
  if assembly == "hg38":
        server = "https://rest.ensembl.org"
  elif assembly == "hg19":
        server = "https://grch37.rest.ensembl.org"
  else:
      print("Assembly must be either 'hg38' or 'hg19' ")
      return
  ext = "/vep/human/id"
  headers={ "Content-Type" : "application/json",  "Accept" : "application/json"}

  # Target Format is '{ "ids" : ["rs56116432", "COSM476" ] }'
  #FORMAT THE BODY OF THE MESSAGE
  variant_dict = {"ids" : list_of_rsid}
  if complete:
      for option in all_options:
          variant_dict[option] = 1
  data = json.dumps(variant_dict)
  
  # POST THE REQUEST
  r = requests.post(server+ext, headers=headers, data=data)
  
  if r.status_code == 200: # IF CODE = OK
      if r.json() != None: # IF SOMETHING IS ACTUALLY RETURNED
          return_dict = r.json()
          
          # PROCESS THE DICTIONNARY SO IT HAS CONSISTENT ENTRIES EVEN IF MISSING
          for dictionnary in return_dict:
              try:
                  dictionnary["transcript_consequences"][0]
              except KeyError:
                  dictionnary["transcript_consequences"] = [{}]
              try:
                  dictionnary["regulatory_feature_consequences"][0]
              except KeyError:
                  dictionnary["regulatory_feature_consequences"] = [{}]
          
          #Process Dict
          for rsidict in return_dict:
              # EXPAND SUB-DICTIONARIES into MAIN DICTIONARIES
              for key, value in rsidict["transcript_consequences"][0].items():
                  rsidict[key] = value
              for key, value in rsidict["regulatory_feature_consequences"][0].items():
                  rsidict[key] = value
              # REMOVE THE EXPANDED SECTIONS
              rsidict.pop("transcript_consequences", None)
              rsidict.pop("regulatory_feature_consequences", None)
              rsidict.pop("colocated_variants", None)
              rsidict.pop("motif_feature_consequences", None)
          # SELECT OUTPUT TYPE
          if output == "dict":
              return return_dict
          elif (output == "df") or (output == "dataframe"):
              df = pd.DataFrame.from_dict(return_dict)
              df.index = df.id
              # REORDER to get info at the beginning
              old_columns = list(df.columns)
              new_columns = ["seq_region_name", "start", "end", "strand", "allele_string"] + [x for x in old_columns if x not in ["seq_region_name", "start", "end", "strand", "allele_string"]]
              df = df[new_columns]
              return df
      else: # IF RESPONSE IS EMPTY JSON 
          print("Error: Empty Response, try less SNPs or check if the rsIDs exist")
          return
  else: # RETURN ERROR CODE
      print("Error {} during request".format(r.status_code))
      return 

Example Usage

See output section


list_of_SNPs = ["rs397509431", "rs28936375", "rs72556554" "rs558269137", "rs779175681", "rs144989330", "rs74315294", "rs200980825"]

output =  quick_vep(list_of_SNPs, # INPUT IS A LIST OF SNP rsIDs
                              output = "df", # OUTPUT IS EITHER "dict" or "df" 
                              complete = False,  # Use complete = True for more informations and annotations
                              assembly = "hg19") # Select Assembly

print(output)

Example of Output

id seq_region_name start end strand allele_string impact
rs397509431 1 53210912 53210914 -1 AGA/A MODIFIER
rs28936375 1 53197092 53197092 1 C/A MODIFIER
rs779175681 17 28737363 28737363 1 C/T MODERATE
rs144989330 3 132689076 132689076 -1 G/C MODERATE
rs200980825 2 162277573 162277573 -1 A/G MODERATE