Manhattan Plots

Function Definition

# Integer codes used for the non-numeric contigs, and the labels printed for them.
_SPECIAL_CHROM_CODES = {"X": 23, "Y": 24, "M": 25, "MT": 25}
_CHROM_CODE_LABELS = {23: "X", 24: "Y", 25: "M"}


def _chrom_to_int(value):
    """Convert a single chromosome label to its integer code.

    String labels may carry a "chr" prefix ("chr7", "chrX"); X, Y and M/MT
    map to 23, 24 and 25. Anything else is passed through ``int()``.
    """
    if isinstance(value, str):
        label = value.strip()
        if label[:3].lower() == "chr":
            label = label[3:]
        code = _SPECIAL_CHROM_CODES.get(label.upper())
        return code if code is not None else int(label)
    return int(value)


def manplot(
    df,
    p_col,
    chr_col,
    pos_col,
    log_col=None,
    gene_col=None,
    p_threshold=None,
    gene_highlights=None,
    size=1,
    colors=["darkgray", "black"],  # never mutated here; kept as a list for compatibility
):
    """Draw a Manhattan plot of -log10(p) against genomic position.

    Chromosomes are laid out left to right in numeric order (X, Y, M/MT are
    treated as 23, 24, 25), each one shifted by the cumulative maximum
    position of the chromosomes before it. The figure is shown with
    ``plt.show()``; nothing is returned and the input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per variant.
    p_col : column label
        Column with unadjusted p-values. Only read when ``log_col`` is falsy.
    chr_col : column label
        Column with chromosome names (numbers, or "X"/"Y"/"M"/"MT", with or
        without a "chr" prefix).
    pos_col : column label
        Column with the position on the chromosome.
    log_col : column label, optional
        Column that already holds -log10(p). When given it is used as is.
    gene_col : column label, optional
        Column with gene names. Needed when ``gene_highlights`` is given.
    p_threshold : float, optional
        Unadjusted p-value threshold. Points at or above -log10(threshold)
        are redrawn in red and a dotted horizontal line marks the threshold.
    gene_highlights : iterable of str, optional
        Genes whose variants are redrawn in green and labelled by name.
    size : number
        Relative size of markers and gene labels.
    colors : sequence of two colors
        ``colors[0]`` is used for even chromosome numbers, ``colors[1]`` for
        odd ones.

    Raises
    ------
    AssertionError
        If ``df`` is not a pandas DataFrame.
    KeyError
        If a referenced column is missing from ``df``.
    ValueError
        If a chromosome label cannot be converted to an integer.
    """
    assert isinstance(df, pd.DataFrame), "INPUT DATAFRAME MUST BE A PANDAS DATAFRAME"

    # Work on a copy so the caller's frame keeps its original chromosome labels.
    data = df.copy()

    # Series.replace("chr", "") only matches whole values, so the prefix is
    # stripped per element instead.
    data[chr_col] = data[chr_col].map(_chrom_to_int).astype("int")

    # Sort by genomic coordinates.
    data = data.sort_values(by=[chr_col, pos_col])

    # Significance column: use the precomputed one if supplied.
    if log_col:
        data["-log10(p)"] = data[log_col]
    else:
        data["-log10(p)"] = -log10(data[p_col])

    # Loop-invariant: the threshold on the -log10 scale.
    threshold = -log10(p_threshold) if p_threshold else None

    # x offset of the current chromosome (sum of the previous maxima).
    offset = 0

    for chrom, chrom_df in data.groupby(by=chr_col):
        positions = chrom_df[pos_col]
        scores = chrom_df["-log10(p)"]
        first = positions.min()
        last = positions.max()

        # Alternate colors by chromosome number parity.
        point_color = colors[0] if chrom % 2 == 0 else colors[1]
        sns.scatterplot(x=positions + offset, y=scores, color=point_color, s=size)

        # Chromosome name centred under its points: the plotted x range is
        # [first + offset, last + offset], so the midpoint uses first + last.
        plt.text(
            (first + last) / 2 + offset,
            -0.5,
            _CHROM_CODE_LABELS.get(chrom, "{}".format(chrom)),
        )

        # Highlight variants passing the significance threshold.
        if threshold is not None:
            hits = chrom_df[scores >= threshold]
            if not hits.empty:
                sns.scatterplot(
                    x=hits[pos_col] + offset,
                    y=hits["-log10(p)"],
                    color="red",
                    s=3 * size,
                )

        # Highlight and label the requested genes found on this chromosome.
        if gene_highlights:
            for gene in gene_highlights:
                gene_df = chrom_df[chrom_df[gene_col] == gene]
                if gene_df.empty:
                    continue
                sns.scatterplot(
                    x=gene_df[pos_col] + offset,
                    y=gene_df["-log10(p)"],
                    color="green",
                    s=3 * size,
                )
                plt.text(
                    gene_df[pos_col].iloc[0] + offset,
                    gene_df["-log10(p)"].max(),
                    gene,
                    horizontalalignment="left",
                    size=5 * size,
                    color="blue",
                    weight="semibold",
                )

        # The next chromosome starts where this one ends.
        offset = offset + last

    # Draw the significance line once, across the full width; drawing it per
    # chromosome stacked overlapping semi-transparent lines.
    if threshold is not None:
        plt.hlines(threshold, 0, offset, color="k", alpha=0.75, linestyles="dotted")

    sns.despine(top=True, left=False, bottom=False, right=True, trim=True)
    plt.xticks([])
    plt.xlabel("Chromosome")
    plt.show()

Example Usage

See the "Example of Output" section below for the resulting plot.

# Load the association results; the first CSV column is used as the index.
df = pd.read_csv("input_manplot.csv", index_col=0)

# Plot settings, gathered in one place and passed to manplot as keywords.
plot_options = {
    "p_col": "unadjusted_p",            # column with unadjusted p-values
    "chr_col": "Chromosome",            # column with chromosome names
    "pos_col": "Position",              # column with positions in base pairs
    "gene_col": "Gene",                 # column with gene names
    "gene_highlights": ["HLA-DRB5"],    # highlight SNPs from these genes (list)
    "p_threshold": 5e-10,               # significance threshold (unadjusted)
    "size": 2,                          # relative size of all plot elements
    "colors": ["blue", "red"],          # alternating colors between chromosomes
}

manplot(df=df, **plot_options)

Example of Input

SNP name	Beta	unadjusted_p	Chromosome	Position	Gene	FDR_p	log10p
SNP_1	-0.040574534	0.800593278	20	61847650	YTHDF1	0.931370854	0.096588061
SNP_2	-0.174536119	0.64720772	X	24072640	EIF2S3	0.862909933	0.188956311
SNP_3	-0.580128227	0.014331882	9	131463936	PKN3	0.110293794	1.843696774
SNP_4	-0.405584529	0.186467648	17	80159506	CCDC57	0.503343058	0.729396508
SNP_5	1.169605031	0.001096328	14	105176736	INF2	0.017841777	2.960059336
SNP_6	0.337168957	0.338987128	13	115000168	CDC16	0.664430503	0.469816793
SNP_7	0.117928214	0.765498459	X	38660511	MID1IP1	0.916440534	0.116055679
SNP_8	0.715705003	0.067896472	X	14891349	MOSPD2	0.292863934	1.168152793
SNP_9	-0.405816494	0.600192762	12	12849159	GPR19	0.839370488	0.221709246
SNP_10	-0.063660205	0.650169775	8	74791285	UBE2W	0.864326097	0.186973224
SNP_11	-0.08868857	0.706600784	19	3676340	PIP5K1C	0.890516986	0.150825885
SNP_12	-0.303048926	0.183223188	5	65222326	ERBB2IP	0.498973046	0.737019566
SNP_13	-0.437425635	0.328402247	5	140242373	PCDHA6	0.655186991	0.48359388
SNP_14	0.029488675	0.874794458	2	220197274	RESP18	0.959027098	0.058093977
SNP_15	0.45811717	0.01331152	7	933891	C7orf20	0.10493113	1.875772362
SNP_16	0.410095604	0.143429657	6	10390961		0.441421759	0.843361041
SNP_17	0.119337457	0.624321762	1	43855438	MED8	0.851375741	0.204591526
SNP_18	-0.204156259	0.042717493	4	103266227	SLC39A8	0.223020875	1.369394238
SNP_19	0.117459133	0.535186888	1	154297735	ATP8B2	0.803773117	0.271494536
SNP_20	-0.178724544	0.656054831	19	1287832	EFNA2	0.867347228	0.183059862
…	…	…	…	…	…	…	…
SNP_N	0.514544394	0.150683713	1	9775985	PIK3CD	0.452800177	0.821933686

Example of Output

Running the example above on this CSV file produces the ManPlot output.