This function used a pre-defined known SNP of interest (for example “6_32413051_A_C”) along with pre-computed association with CpGs (typically statistical test SNP genotype vs Beta-Value) to plot the significance between each CpG and the SNP of interest.
Still slow for now.
Function Definition
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from matplotlib.patches import Rectangle
import seaborn as sns
from numpy import log10
import myvariant
import requests
from math import ceil
def plot_methqtls(df, snp_string, p_value_column, chromosome_columns, position_column, global_size = 1):
mv = myvariant.MyVariantInfo()
def print_percent_done(index, total, bar_len=50, title='Please wait'):
'''
index is expected to be 0 based index.
0 <= index < total
from Ivan Procopovich @ https://stackoverflow.com/questions/6169217/replace-console-output-in-python
'''
percent_done = (index+1)/total*100
percent_done = round(percent_done, 1)
done = round(percent_done/(100/bar_len))
togo = bar_len-done
done_str = '█'*int(done)
togo_str = '░'*int(togo)
print(f'\t⏳{title}: [{done_str}{togo_str}] {percent_done}% done', end='\r')
if index + 1 == total:
print('\t✅')
def closest_threshold(value):
list_of_thresholds = []
for index in range(1, 9):
list_of_thresholds.append(1 * (10**-index))
return min(list_of_thresholds, key=lambda x:abs(x-value))
cmap = matplotlib.cm.get_cmap('Reds')
def norm_series(serie):
serie = serie.apply(lambda x: closest_threshold(x)) #CLOSEST THRESHOLD
serie = -log10(serie)
max = serie.max()
min = serie.min()
new_series = serie.apply(lambda x: ((x - min)/(max-min)))
return new_series
list_of_thresholds = []
for index in range(1, 9):
list_of_thresholds.append(1 * (10**-index))
transformed_thresh = norm_series(pd.Series(list_of_thresholds))
#print(transformed_thresh)
df_holder = []
pos_holder = []
snp_dict = {}
chromosome_to_study = int(snp_string.split("_")[0])
df = df[df[chromosome_columns] == chromosome_to_study]
if not df.empty:
cpg_genes = list(set(df["gene"]))
cpg_genes = [x for x in cpg_genes if str(x) != "nan"]
#df_holder.append(df)
snp = snp_string.split("_")
position_of_snp = int(snp[1])
pos_holder.append(position_of_snp)
snp_to_search = "chr" + snp[0] + ":g." + snp[1] + "" + snp[2] + ">" + snp[3]
snp = snp[0] + ":" + snp[1] + ":" + snp[2] + ">" + snp[3]
snp_dict[snp] = {"pos": position_of_snp,
"df":df,
"snp_id":snp_to_search,
"cpg_genes": cpg_genes}
#MAX P-VALUE / -log10
# 0.0499965573704969 / 1.3010598989933826
# MIN P-VALUE / -log10
# 1.93e-294 / 293.7144426909922
x_limits = [min(pos_holder) - 150000, max(pos_holder) + 150000]
for snp_name in snp_dict:
#print(snp_name)
chrom = snp_name.split(":")[0]
df = snp_dict[snp_name]["df"]
df["-log10p_scaled"] = norm_series(df[p_value_column])
snp_to_search = snp_dict[snp_name]["snp_id"]
#print(snp)
results = mv.getvariant(snp_to_search)
#print("SNP SEARCHED")
try:
gene_related_to_snp = results["dbsnp"]["gene"]["symbol"]
except (KeyError, TypeError) as e:
gene_related_to_snp = ""
#MAKE A CIRCLE ON SNP LEVEL
fig, ax = plt.subplots()
x_limits = [df[position_column].min(), df[position_column].max()]
#print("CPG LOOPING")
x = 0
y = df.shape[0]
for cpg in df.index:
print_percent_done(x, y, title = "Plotting CpGs and Links")
row = df.loc[cpg]
cpg_pos = int(row[position_column])
#POINT FOR EACH CPG
x_values = [snp_dict[snp_name]["pos"], cpg_pos]
y_values = [1, 2]
rgba = cmap(row["-log10p_scaled"])
plt.plot(x_values, y_values, linewidth=1, c = rgba, antialiased=True)
plt.scatter(cpg_pos, 2, s=15, c="green")
x+=1
cpg_spanning_genes = snp_dict[snp_name]["cpg_genes"]
#print("START LOOPING OVER GENES")
x = 0
y = len(cpg_spanning_genes)
for gene in cpg_spanning_genes:
print_percent_done(x, y, title = "Plotting Gene Boundaries")
server = "https://grch37.rest.ensembl.org"
ext = "/lookup/symbol/homo_sapiens/{}?expand=0".format(gene)
r = requests.get(server+ext, headers={ "Content-Type" : "application/json"})
if r.ok:
decoded = r.json()
start = decoded["start"]
end = decoded["end"]
strand = decoded["strand"]
if strand == -1:
rect = Rectangle((start, 2.05), (end-start), 0.05, linewidth=1, edgecolor='r', facecolor='lightcoral', alpha = 0.75) #xy is bottom right based
ax.add_patch(rect)
plt.text(x = (start + (end-start)/2), y = 2.075, s=gene, horizontalalignment = "center", verticalalignment = "center", size = global_size * 8)
elif strand == 1:
rect = Rectangle((start, 2.11), (end-start), 0.05, linewidth=1, edgecolor='b', facecolor='skyblue', alpha = 0.75) #xy is bottom right based
ax.add_patch(rect)
plt.text(x = (start + (end-start)/2), y = 2.135, s=gene, horizontalalignment = "center", verticalalignment = "center", size = global_size * 8)
x+=1
x_limits = ax.get_xlim()
plt.hlines(1, x_limits[0], x_limits[1], color="black") # SNP LEVEL
plt.hlines(2, x_limits[0], x_limits[1], color="black") # CPG LEVEL
plt.scatter(snp_dict[snp_name]["pos"], 1, s=20, c="red")
for cpg in df.index:
row = df.loc[cpg]
cpg_pos = int(row[position_column])
plt.scatter(cpg_pos, 2, s=20, c="green")
#CUSTOM LEGEND
# 0 0.000000
# 1 0.142857
# 2 0.285714
# 3 0.428571
# 4 0.571429
# 5 0.714286
# 6 0.857143
# 7 1.000000
# [0.1, 0.01, 0.001, 0.0001, 1e-05, 1e-06, 1e-07, 1e-08]
legend_elements = [Line2D([0], [0], color= cmap(0.000000), lw=4, label='p < 0.1'),
Line2D([0], [0], color= cmap(0.142857), lw=4, label='p < 0.01'),
Line2D([0], [0], color= cmap(0.285714), lw=4, label='p < 0.001 '),
Line2D([0], [0], color= cmap(0.428571), lw=4, label='p < 0.0001'),
Line2D([0], [0], color= cmap(0.571429), lw=4, label='p < 1e-05'),
Line2D([0], [0], color= cmap(0.714286), lw=4, label='p < 1e-06'),
Line2D([0], [0], color= cmap(0.857143), lw=4, label='p < 1e-07'),
Line2D([0], [0], color= cmap(1.000000), lw=4, label='p < 1e-08')]
plt.text(x = snp_dict[snp_name]["pos"], y = 0.96, s=gene_related_to_snp, horizontalalignment = "center") # SNP GENE NAME
plt.title("mQTL {}".format(snp_name))
plt.xlabel("Chr {} coordinates".format(chrom))
plt.yticks([1,2,2.075, 2.135 ], ["SNP", "CpGs", "- Strand", "+ Strand"])
ax.legend(handles=legend_elements, loc=0, bbox_to_anchor=(1, 1))
ax.get_xaxis().set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
#ax.ticklabel_format(style='plain')
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['bottom'].set_visible(False)
#plt.show()
fig = plt.gcf()
fig.set_size_inches(15, 5)
#plt.savefig("/Volumes/ax312/mQTL/PRS_SNP_CPG/Per_SNP/Figure_single_mQTL/mqtl_{}.png".format(snp_name.replace(":", "_"), dpi=1000))
#plt.clf()
plt.tight_layout()
plt.show()
Example Usage
See output section
df = pd.read_csv("input_mqtl.csv", index_col=0)
plot_methqtls(df = df, # Input Dataframe
snp_string = "6_32413051_A_C", #SNP of interest info, separated by underscore format: chr_pos_ref_alt
p_value_column = "Pr(>F)", # Name of p value column
chromosome_columns = "chr", # Name of Chromosome column
position_column = "pos", # Name of Position Columns
global_size = 2) # Global Size of texts
Example of Input
CpG | p | chr | pos | gene |
---|---|---|---|---|
cg18244508 | 4.79E-05 | 6 | 32172871 | NOTCH4 |
cg22863148 | 6.87E-06 | 6 | 32339554 | C6orf10 |
cg01626341 | 6.77E-06 | 6 | 32440987 | |
cg24147543 | 4.72E-72 | 6 | 32554481 | HLA-DRB1 |
cg12909670 | 7.44E-07 | 6 | 32732978 | |
cg03465320 | 0.000384507 | 6 | 32823078 | PSMB9 |
cg21597601 | 9.99E-05 | 6 | 32380505 | |
cg23117647 | 6.72E-05 | 6 | 32712930 | HLA-DQA2 |
cg14614539 | 8.45E-13 | 6 | 32170458 | NOTCH4 |
cg09798996 | 0.000325866 | 6 | 32182326 | NOTCH4 |
cg25949002 | 1.34E-09 | 6 | 32301073 | C6orf10 |
cg18067840 | 5.97E-13 | 6 | 32375672 | BTNL2 |
cg18060330 | 0.000895304 | 6 | 32376066 | BTNL2 |
cg22425813 | 8.26E-36 | 6 | 32411670 | HLA-DRA |
cg03607220 | 1.98E-30 | 6 | 32526263 | HLA-DRB6 |