methQTLs Plotting

This function used a pre-defined known SNP of interest (for example “6_32413051_A_C”) along with pre-computed association with CpGs (typically statistical test SNP genotype vs Beta-Value) to plot the significance between each CpG and the SNP of interest.

Still slow for now.

Function Definition

import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from matplotlib.patches import Rectangle
import seaborn as sns
from numpy import log10
import myvariant
import requests
from math import ceil

def plot_methqtls(df, snp_string, p_value_column, chromosome_columns, position_column, global_size = 1):
    mv = myvariant.MyVariantInfo()

    def print_percent_done(index, total, bar_len=50, title='Please wait'):
        '''
        index is expected to be 0 based index. 
        0 <= index < total
        from Ivan Procopovich @ https://stackoverflow.com/questions/6169217/replace-console-output-in-python
        
        '''
        percent_done = (index+1)/total*100
        percent_done = round(percent_done, 1)

        done = round(percent_done/(100/bar_len))
        togo = bar_len-done

        done_str = '█'*int(done)
        togo_str = '░'*int(togo)

        print(f'\t⏳{title}: [{done_str}{togo_str}] {percent_done}% done', end='\r')

        if index + 1 == total:
            print('\t✅')


    def closest_threshold(value):
        list_of_thresholds = []
        for index in range(1, 9):
            list_of_thresholds.append(1 * (10**-index))
        return min(list_of_thresholds, key=lambda x:abs(x-value))

    cmap = matplotlib.cm.get_cmap('Reds')

    def norm_series(serie):
        serie = serie.apply(lambda x: closest_threshold(x)) #CLOSEST THRESHOLD
        serie = -log10(serie)
        max = serie.max()
        min = serie.min()
        new_series = serie.apply(lambda x: ((x - min)/(max-min)))
        return new_series
    
    list_of_thresholds = []
    for index in range(1, 9):
        list_of_thresholds.append(1 * (10**-index))
    transformed_thresh = norm_series(pd.Series(list_of_thresholds))
    #print(transformed_thresh)



    df_holder = []
    pos_holder = []
    snp_dict = {}
    chromosome_to_study = int(snp_string.split("_")[0])

    df = df[df[chromosome_columns] == chromosome_to_study]

    if not df.empty:
        cpg_genes = list(set(df["gene"]))
        cpg_genes = [x for x in cpg_genes if str(x) != "nan"]
        #df_holder.append(df)
        snp = snp_string.split("_")
        position_of_snp = int(snp[1])
        pos_holder.append(position_of_snp)
        snp_to_search = "chr" + snp[0] + ":g." + snp[1] + "" + snp[2] + ">" + snp[3]
        snp = snp[0] + ":" + snp[1] + ":" + snp[2] + ">" + snp[3]
        snp_dict[snp] = {"pos": position_of_snp,
                         "df":df,
                         "snp_id":snp_to_search,
                         "cpg_genes": cpg_genes}
    #MAX P-VALUE / -log10
    # 0.0499965573704969 / 1.3010598989933826
    # MIN P-VALUE / -log10
    # 1.93e-294 / 293.7144426909922

    x_limits = [min(pos_holder) - 150000, max(pos_holder) + 150000]

    for snp_name in snp_dict:
        #print(snp_name)
        chrom = snp_name.split(":")[0]
        df = snp_dict[snp_name]["df"]
        df["-log10p_scaled"] = norm_series(df[p_value_column])
        snp_to_search = snp_dict[snp_name]["snp_id"]
        #print(snp)
        results = mv.getvariant(snp_to_search)
        #print("SNP SEARCHED")
        try:
            gene_related_to_snp = results["dbsnp"]["gene"]["symbol"]
        except (KeyError, TypeError) as e:
            gene_related_to_snp = ""
        #MAKE A CIRCLE ON SNP LEVEL
        fig, ax = plt.subplots()
        x_limits = [df[position_column].min(), df[position_column].max()]
        #print("CPG LOOPING")
        x = 0
        y = df.shape[0]
        for cpg in df.index:
            print_percent_done(x, y, title = "Plotting CpGs and Links")
            row = df.loc[cpg]
            cpg_pos = int(row[position_column])
            #POINT FOR EACH CPG
            x_values = [snp_dict[snp_name]["pos"], cpg_pos]
            y_values = [1, 2]
            rgba = cmap(row["-log10p_scaled"])
            plt.plot(x_values, y_values, linewidth=1, c = rgba, antialiased=True)
            plt.scatter(cpg_pos, 2, s=15, c="green")
            x+=1

        cpg_spanning_genes = snp_dict[snp_name]["cpg_genes"]
        #print("START LOOPING OVER GENES")
        x = 0 
        y = len(cpg_spanning_genes)
        for gene in cpg_spanning_genes:
            print_percent_done(x, y, title = "Plotting Gene Boundaries")
            server = "https://grch37.rest.ensembl.org"
            ext = "/lookup/symbol/homo_sapiens/{}?expand=0".format(gene)
            r = requests.get(server+ext, headers={ "Content-Type" : "application/json"})
            if r.ok:
                decoded = r.json()
                start = decoded["start"]
                end = decoded["end"]
                strand = decoded["strand"]
                if strand == -1:
                    rect = Rectangle((start, 2.05), (end-start), 0.05, linewidth=1, edgecolor='r', facecolor='lightcoral', alpha = 0.75) #xy is bottom right based
                    ax.add_patch(rect)
                    plt.text(x = (start + (end-start)/2), y = 2.075, s=gene, horizontalalignment  = "center", verticalalignment = "center", size = global_size * 8)
                elif strand == 1:
                    rect = Rectangle((start, 2.11), (end-start), 0.05, linewidth=1, edgecolor='b', facecolor='skyblue', alpha = 0.75) #xy is bottom right based
                    ax.add_patch(rect)
                    plt.text(x = (start + (end-start)/2), y = 2.135, s=gene, horizontalalignment  = "center", verticalalignment = "center", size = global_size * 8)
            x+=1
            
        x_limits = ax.get_xlim()
        plt.hlines(1, x_limits[0], x_limits[1], color="black") # SNP LEVEL
        plt.hlines(2, x_limits[0], x_limits[1], color="black") # CPG LEVEL

        plt.scatter(snp_dict[snp_name]["pos"], 1, s=20, c="red")

        for cpg in df.index:
            row = df.loc[cpg]
            cpg_pos = int(row[position_column])
            plt.scatter(cpg_pos, 2, s=20, c="green")

        #CUSTOM LEGEND
        # 0    0.000000
        # 1    0.142857
        # 2    0.285714
        # 3    0.428571
        # 4    0.571429
        # 5    0.714286
        # 6    0.857143
        # 7    1.000000

        # [0.1, 0.01, 0.001, 0.0001, 1e-05, 1e-06, 1e-07, 1e-08]
        legend_elements = [Line2D([0], [0], color= cmap(0.000000), lw=4, label='p < 0.1'),
                           Line2D([0], [0], color= cmap(0.142857), lw=4, label='p < 0.01'),
                           Line2D([0], [0], color= cmap(0.285714), lw=4, label='p < 0.001 '),
                           Line2D([0], [0], color= cmap(0.428571), lw=4, label='p < 0.0001'),
                           Line2D([0], [0], color= cmap(0.571429), lw=4, label='p < 1e-05'),
                           Line2D([0], [0], color= cmap(0.714286), lw=4, label='p < 1e-06'),
                           Line2D([0], [0], color= cmap(0.857143), lw=4, label='p < 1e-07'),
                           Line2D([0], [0], color= cmap(1.000000), lw=4, label='p < 1e-08')]

        plt.text(x = snp_dict[snp_name]["pos"], y = 0.96, s=gene_related_to_snp, horizontalalignment  = "center") # SNP GENE NAME
        plt.title("mQTL {}".format(snp_name))
        plt.xlabel("Chr {} coordinates".format(chrom))
        plt.yticks([1,2,2.075, 2.135 ], ["SNP", "CpGs", "- Strand", "+ Strand"])
        ax.legend(handles=legend_elements, loc=0, bbox_to_anchor=(1, 1))
        ax.get_xaxis().set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
        #ax.ticklabel_format(style='plain')
        ax.spines['right'].set_visible(False)
        ax.spines['top'].set_visible(False)
        ax.spines['left'].set_visible(False)
        ax.spines['bottom'].set_visible(False)
        #plt.show()

        fig = plt.gcf()
        fig.set_size_inches(15, 5)
        #plt.savefig("/Volumes/ax312/mQTL/PRS_SNP_CPG/Per_SNP/Figure_single_mQTL/mqtl_{}.png".format(snp_name.replace(":", "_"), dpi=1000))
        #plt.clf()
        plt.tight_layout()
        plt.show()

Example Usage

See output section

df = pd.read_csv("input_mqtl.csv", index_col=0)

plot_methqtls(df = df, # Input Dataframe
              snp_string = "6_32413051_A_C", #SNP of interest info, separated by underscore format:  chr_pos_ref_alt
              p_value_column = "Pr(>F)", # Name of p value column
              chromosome_columns = "chr", # Name of Chromosome column
              position_column = "pos", # Name of Position Columns
              global_size = 2) # Global Size of texts

Example of Input

CpG	p	chr	pos	gene
cg18244508	4.79E-05	6	32172871	NOTCH4
cg22863148	6.87E-06	6	32339554	C6orf10
cg01626341	6.77E-06	6	32440987
cg24147543	4.72E-72	6	32554481	HLA-DRB1
cg12909670	7.44E-07	6	32732978
cg03465320	0.000384507	6	32823078	PSMB9
cg21597601	9.99E-05	6	32380505
cg23117647	6.72E-05	6	32712930	HLA-DQA2
cg14614539	8.45E-13	6	32170458	NOTCH4
cg09798996	0.000325866	6	32182326	NOTCH4
cg25949002	1.34E-09	6	32301073	C6orf10
cg18067840	5.97E-13	6	32375672	BTNL2
cg18060330	0.000895304	6	32376066	BTNL2
cg22425813	8.26E-36	6	32411670	HLA-DRA
cg03607220	1.98E-30	6	32526263	HLA-DRB6

Example of Output

Using this csv file ManPlot output