Function Definition
def manplot(df, p_col, chr_col, pos_col, log_col = None, gene_col = None, p_threshold = None, gene_highlights = None, size = 1, colors = ["darkgray", "black"]):
#ASSERTIONS
assert type(df) == pd.DataFrame, "INPUT DATAFRAME MUST BE A PANDAS DATAFRAME"
####################
### RUN FUNCTION ###
####################
# REPLACE X Y MT M by 23, 24, 25
df[chr_col] = df[chr_col].replace("chr", "") # REPLACE chr if chr in contig name
df[chr_col] = df[chr_col].replace(["X", "Y", "M", "MT"], [23, 24, 25, 25])
df[chr_col] = df[chr_col].astype("int")
# SORT BY COORDINATES
df = df.sort_values(by = [chr_col, pos_col], ascending= (True, True))
# GET SIGNIFICANCE COLUMNS
if log_col:
sig_col = df[log_col]
else:
sig_col = - log10(df[p_col])
df["-log10(p)"] = sig_col
# GROUP BY CHROMOSOMES
groupped = df.groupby(by = chr_col)
new_start = 0
#LOOP OVER ALL CHROMOSOMES
for tup in groupped:
chrom = tup[0]
chrom_df = tup[1]
position_series = chrom_df[pos_col]
#GET START AND END OF CURRENT CHROMOSOME
start = position_series.min()
end = position_series.max()
#UPDATE THE START WITH END OF PREVIOUS CHR
x_series = position_series + new_start
#GET LOG10 COLUMNS
effect_series = chrom_df["-log10(p)"]
# PLOT DIFFERENT COLORS FOR ALTERNATING CHROMOSOMES
if chrom % 2 == 0:
sns.scatterplot(x = x_series, y = effect_series, color = colors[0], s=1*size)
else:
sns.scatterplot(x = x_series, y = effect_series, color = colors[1], s=1*size)
# FIND MIDDLE OF POSITION TO PRINT CHROMOSOME NUMBER
chrom_label_x = (position_series.max() -position_series.min())/2 + new_start
chrom_label_y = - 0.5
plt.text(chrom_label_x,chrom_label_y, "{}".format(chrom).replace("23", "X").replace("24", "Y").replace("25", "M"))
# HIGHLIGHT SIGNIFICANT ONES (-logp > 4)
if p_threshold:
sig_df = chrom_df[["-log10(p)", pos_col]]
sig_df = sig_df[sig_df["-log10(p)"] >= -log10(p_threshold)]
sig_effect = sig_df["-log10(p)"]
sig_pos = sig_df[pos_col] + new_start
sns.scatterplot(x = sig_pos, y = sig_effect, color = "red", s=3*size)
# HIGHLIGHT GENES
if gene_highlights:
for gene in gene_highlights:
if gene in list(set(chrom_df[gene_col])):
gene_df = chrom_df[chrom_df[gene_col] == gene]
sns.scatterplot(x = gene_df[pos_col] + new_start, y = gene_df["-log10(p)"], color = "green", s= 3*size)
plt.text( list(gene_df[pos_col])[0] + new_start, gene_df["-log10(p)"].max(), gene , horizontalalignment='left', size=5*size, color='blue', weight='semibold')
# UPDATE THE NEW STARTING POINT OF NEXT CHROMOSOME
new_start = new_start + end
# PAINT SIGNIFICANCE LINE
if p_threshold:
plt.hlines(-log10(p_threshold), 0, new_start, color = "k", alpha = 0.75, linestyles = "dotted")
sns.despine(top = True, left = False, bottom = False, right = True, trim=True)
plt.xticks([])
plt.xlabel("Chromosome")
plt.show()
Example Usage
See output section
df = pd.read_csv("input_manplot.csv", index_col = 0)
manplot(df = df, # Dataframe containing all data
p_col = "unadjusted_p", # Column name containing unadjusted p values
chr_col = "Chromosome", # Column name containing Chromosome names
pos_col = "Position", # Column name containing position in base pair
size=2, # Relative size of all elements of plot
p_threshold=0.0000000005, # SIgnificance threshold (unadjusted)
gene_col="Gene", # Column name containing Gene Names
gene_highlights=["HLA-DRB5"], # Highlight SNPs from thoses genes (list)
colors=["blue", "red"]) # Alternating Colors between chromosomes
Example of Input
SNP name | Beta | unadjusted_p | Chromosome | Position | Gene | FDR_p | log10p |
---|---|---|---|---|---|---|---|
SNP_1 | -0.040574534 | 0.800593278 | 20 | 61847650 | YTHDF1 | 0.931370854 | 0.096588061 |
SNP_2 | -0.174536119 | 0.64720772 | X | 24072640 | EIF2S3 | 0.862909933 | 0.188956311 |
SNP_3 | -0.580128227 | 0.014331882 | 9 | 131463936 | PKN3 | 0.110293794 | 1.843696774 |
SNP_4 | -0.405584529 | 0.186467648 | 17 | 80159506 | CCDC57 | 0.503343058 | 0.729396508 |
SNP_5 | 1.169605031 | 0.001096328 | 14 | 105176736 | INF2 | 0.017841777 | 2.960059336 |
SNP_6 | 0.337168957 | 0.338987128 | 13 | 115000168 | CDC16 | 0.664430503 | 0.469816793 |
SNP_7 | 0.117928214 | 0.765498459 | X | 38660511 | MID1IP1 | 0.916440534 | 0.116055679 |
SNP_8 | 0.715705003 | 0.067896472 | X | 14891349 | MOSPD2 | 0.292863934 | 1.168152793 |
SNP_9 | -0.405816494 | 0.600192762 | 12 | 12849159 | GPR19 | 0.839370488 | 0.221709246 |
SNP_10 | -0.063660205 | 0.650169775 | 8 | 74791285 | UBE2W | 0.864326097 | 0.186973224 |
SNP_11 | -0.08868857 | 0.706600784 | 19 | 3676340 | PIP5K1C | 0.890516986 | 0.150825885 |
SNP_12 | -0.303048926 | 0.183223188 | 5 | 65222326 | ERBB2IP | 0.498973046 | 0.737019566 |
SNP_13 | -0.437425635 | 0.328402247 | 5 | 140242373 | PCDHA6 | 0.655186991 | 0.48359388 |
SNP_14 | 0.029488675 | 0.874794458 | 2 | 220197274 | RESP18 | 0.959027098 | 0.058093977 |
SNP_15 | 0.45811717 | 0.01331152 | 7 | 933891 | C7orf20 | 0.10493113 | 1.875772362 |
SNP_16 | 0.410095604 | 0.143429657 | 6 | 10390961 | 0.441421759 | 0.843361041 | |
SNP_17 | 0.119337457 | 0.624321762 | 1 | 43855438 | MED8 | 0.851375741 | 0.204591526 |
SNP_18 | -0.204156259 | 0.042717493 | 4 | 103266227 | SLC39A8 | 0.223020875 | 1.369394238 |
SNP_19 | 0.117459133 | 0.535186888 | 1 | 154297735 | ATP8B2 | 0.803773117 | 0.271494536 |
SNP_20 | -0.178724544 | 0.656054831 | 19 | 1287832 | EFNA2 | 0.867347228 | 0.183059862 |
… | … | … | … | … | … | … | … |
SNP_N | 0.514544394 | 0.150683713 | 1 | 9775985 | PIK3CD | 0.452800177 | 0.821933686 |