Function Definition

def manplot(df, p_col, chr_col, pos_col, log_col = None, gene_col = None, p_threshold = None, gene_highlights = None, size = 1, colors = ["darkgray", "black"]):
    #ASSERTIONS 
    assert type(df) == pd.DataFrame, "INPUT DATAFRAME MUST BE A PANDAS DATAFRAME"

    ####################
    ### RUN FUNCTION ###
    ####################

    # REPLACE X Y MT M by 23, 24, 25
    df[chr_col] = df[chr_col].replace("chr", "") # REPLACE chr if chr in contig name
    df[chr_col] = df[chr_col].replace(["X", "Y", "M", "MT"], [23, 24, 25, 25])
    df[chr_col] = df[chr_col].astype("int")


    # SORT BY COORDINATES
    df = df.sort_values(by = [chr_col, pos_col], ascending= (True, True))

    # GET SIGNIFICANCE COLUMNS
    if log_col:
        sig_col = df[log_col]
    else:
        sig_col =  - log10(df[p_col])
    df["-log10(p)"] = sig_col

    # GROUP BY CHROMOSOMES
    groupped = df.groupby(by = chr_col)
    new_start = 0

    #LOOP OVER ALL CHROMOSOMES
    for tup in groupped:
        chrom = tup[0]
        chrom_df = tup[1]

        position_series = chrom_df[pos_col]
        #GET START AND END OF CURRENT CHROMOSOME
        start = position_series.min()
        end = position_series.max()
        
        #UPDATE THE START WITH END OF PREVIOUS CHR
        x_series = position_series + new_start
        
        #GET LOG10 COLUMNS
        effect_series = chrom_df["-log10(p)"]
        
        # PLOT DIFFERENT COLORS FOR ALTERNATING CHROMOSOMES
        if chrom % 2 == 0:
            sns.scatterplot(x = x_series, y = effect_series, color = colors[0],  s=1*size)
        else:
            sns.scatterplot(x = x_series, y = effect_series, color = colors[1],  s=1*size)
            
        # FIND MIDDLE OF POSITION TO PRINT CHROMOSOME NUMBER 
        chrom_label_x =  (position_series.max() -position_series.min())/2 + new_start
        chrom_label_y = - 0.5
        
        plt.text(chrom_label_x,chrom_label_y, "{}".format(chrom).replace("23", "X").replace("24", "Y").replace("25", "M"))

        # HIGHLIGHT SIGNIFICANT ONES (-logp > 4)
        if p_threshold:
            sig_df = chrom_df[["-log10(p)", pos_col]]
            sig_df = sig_df[sig_df["-log10(p)"] >= -log10(p_threshold)]
            sig_effect = sig_df["-log10(p)"]
            sig_pos = sig_df[pos_col] + new_start
            sns.scatterplot(x = sig_pos, y = sig_effect, color = "red",  s=3*size)
        
        # HIGHLIGHT GENES
        if gene_highlights:
            for gene in gene_highlights:
                if gene in list(set(chrom_df[gene_col])):
                    gene_df = chrom_df[chrom_df[gene_col] == gene]

                    sns.scatterplot(x = gene_df[pos_col] + new_start, y = gene_df["-log10(p)"], color = "green", s= 3*size)
                    plt.text( list(gene_df[pos_col])[0] + new_start, gene_df["-log10(p)"].max(), gene , horizontalalignment='left', size=5*size, color='blue', weight='semibold')

        # UPDATE THE NEW STARTING POINT OF NEXT CHROMOSOME
        new_start = new_start + end
        
        # PAINT SIGNIFICANCE LINE
        if p_threshold:
            plt.hlines(-log10(p_threshold), 0, new_start, color = "k", alpha = 0.75, linestyles = "dotted")
            
    sns.despine(top = True, left = False, bottom = False, right = True, trim=True)
    plt.xticks([])
    plt.xlabel("Chromosome")
    plt.show()

Example Usage

See output section

df = pd.read_csv("input_manplot.csv", index_col = 0)

manplot(df = df, # Dataframe containing all data
        p_col = "unadjusted_p", # Column name containing unadjusted p values
        chr_col = "Chromosome", # Column name containing Chromosome names
        pos_col = "Position", # Column name containing position  in base pair
        size=2, # Relative size of all elements of plot
        p_threshold=0.0000000005, # SIgnificance threshold (unadjusted)
        gene_col="Gene", # Column name containing Gene Names
        gene_highlights=["HLA-DRB5"], # Highlight SNPs from thoses genes (list)
        colors=["blue", "red"]) # Alternating Colors between chromosomes

Example of Input

SNP name Beta unadjusted_p Chromosome Position Gene FDR_p log10p
SNP_1 -0.040574534 0.800593278 20 61847650 YTHDF1 0.931370854 0.096588061
SNP_2 -0.174536119 0.64720772 X 24072640 EIF2S3 0.862909933 0.188956311
SNP_3 -0.580128227 0.014331882 9 131463936 PKN3 0.110293794 1.843696774
SNP_4 -0.405584529 0.186467648 17 80159506 CCDC57 0.503343058 0.729396508
SNP_5 1.169605031 0.001096328 14 105176736 INF2 0.017841777 2.960059336
SNP_6 0.337168957 0.338987128 13 115000168 CDC16 0.664430503 0.469816793
SNP_7 0.117928214 0.765498459 X 38660511 MID1IP1 0.916440534 0.116055679
SNP_8 0.715705003 0.067896472 X 14891349 MOSPD2 0.292863934 1.168152793
SNP_9 -0.405816494 0.600192762 12 12849159 GPR19 0.839370488 0.221709246
SNP_10 -0.063660205 0.650169775 8 74791285 UBE2W 0.864326097 0.186973224
SNP_11 -0.08868857 0.706600784 19 3676340 PIP5K1C 0.890516986 0.150825885
SNP_12 -0.303048926 0.183223188 5 65222326 ERBB2IP 0.498973046 0.737019566
SNP_13 -0.437425635 0.328402247 5 140242373 PCDHA6 0.655186991 0.48359388
SNP_14 0.029488675 0.874794458 2 220197274 RESP18 0.959027098 0.058093977
SNP_15 0.45811717 0.01331152 7 933891 C7orf20 0.10493113 1.875772362
SNP_16 0.410095604 0.143429657 6 10390961   0.441421759 0.843361041
SNP_17 0.119337457 0.624321762 1 43855438 MED8 0.851375741 0.204591526
SNP_18 -0.204156259 0.042717493 4 103266227 SLC39A8 0.223020875 1.369394238
SNP_19 0.117459133 0.535186888 1 154297735 ATP8B2 0.803773117 0.271494536
SNP_20 -0.178724544 0.656054831 19 1287832 EFNA2 0.867347228 0.183059862
SNP_N 0.514544394 0.150683713 1 9775985 PIK3CD 0.452800177 0.821933686

Example of Output

Using this csv file ManPlot output