Source code for snaf.gtex_viewer


 

import numpy as np
import pandas as pd
import os
import sys
import pickle
import h5py
import matplotlib.pyplot as plt
import anndata as ad
import seaborn as sns
from scipy.sparse import csr_matrix


import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42
mpl.rcParams['font.family'] = 'Arial'

# gtex viewer

def gtex_viewer_configuration(adata_passed_in):
    global adata
    adata = adata_passed_in

[docs]def gtex_visual_norm_count_combined(uid,xlim=None,ylim=None,save_df=False,outdir='.'):
    '''
    This function is to visualize the normalized count (CPM) distribution,
    it should follow a half-norm distribution, this plot will help determine the choice in
    bayesian modeling for tumor specificity score

    :param uid: string, the uid for the splicing event
    :param xlim: None or tuple, whether to constrain the xlim of the histplot
    :param ylim: None or tuple, whether to constrain the ylim of the histplot
    :param save_df: boolean, whether to save a txt file reporting all the cpm values and the tissue information for debug purpose
    :param outdir: string, where the figure go into

    Example::
    
        snaf.gtex_visual_norm_count_combined(uid='ENSG00000090339:E4.3-E4.5')  
    '''
    if not os.path.exists(outdir):
        os.mkdir(outdir)
    info = adata[[uid],:]
    scale_factor_dict = adata.var['total_count'].to_dict()
    df = pd.DataFrame(data={'value':info.X.toarray().squeeze(),'tissue':info.var['tissue'].values},index=info.var_names)
    df['value_cpm'] = df['value'].values / df.index.map(scale_factor_dict).values
    data = df['value_cpm'].values 
    fig, ax = plt.subplots()
    sns.histplot(data,bins=100,ax=ax)
    if xlim is not None:
        ax.set_xlim(xlim)
    if ylim is not None:
        ax.set_ylim(ylim)
    ax.set_xlabel('Count Per Million(CPM)')
    plt.savefig(os.path.join(outdir,'hist_{}.pdf'.format(uid.replace(':','_'))),bbox_inches='tight')
    plt.close()
    if save_df:
        df.to_csv(os.path.join(outdir,'cpm_{}.txt'.format(uid.replace(':','_'))),sep='\t')
    return df

[docs]def gtex_visual_per_tissue_count(uid,total_count=10, count_cutoff=1, total=25, outdir='.'):
    '''
    This function is to visualize the random variable X, which repesents the number of samples in each
    tissue type that express this junction. It should follow a zeroinflated poisson distribution. We do not 
    consider tissues with less than total_count samples, and we use count_cutoff to represent how many count can
    be defined as "expressed" versus "non-expressed". We scaled all the number of samples to the total of "total" to make sure 
    they are comparable. Because if a tissue type has 1000 samples, another has 25 samples, this random variable is not directly
    comparable. this plot will help determine the choice in bayesian modeling for tumor specificity score

    :param uid: string, the uid for the splicing event
    :param outdir: string, where the figure go into

    Example::
    
        snaf.gtex_visual_per_tissue_count(uid='ENSG00000090339:E4.3-E4.5')  
    '''
    if not os.path.exists(outdir):
        os.mkdir(outdir)
    x = []
    for tissue in adata.var['tissue'].unique():
        sub = adata[uid,adata.var['tissue']==tissue]
        total_count = sub.shape[1]
        if total_count >= total_count:
            c = np.count_nonzero(np.where(sub.X.toarray()<count_cutoff,0,sub.X.toarray()))
            scaled_c = round(c * (total/total_count),0)
            x.append(scaled_c)
    x = np.array(x)
    fig,ax = plt.subplots()
    try:
        sns.histplot(x,binwidth=1,ax=ax)
    except:
        sns.histplot(x,ax=ax)  # if x is with no variance at all, binwidth can not be 1
    ax.set_xlabel('Number of samples expressing this target in each tissue type')
    plt.savefig(os.path.join(outdir,'tissue_dist_{}.pdf'.format(uid.replace(':','_'))),bbox_inches='tight')
    plt.close()


[docs]def gtex_visual_combine_plotly(uid,outdir='',norm=False,tumor=None):
    '''
    Generate combined plot but interactively

    :param uid: string, the uid for the splicing event
    :param norm: bool. whether normalize for the sequencing depth or not
    :param outdir: string, where the figure go into
    :param tumor: pandas dataframe, the tumor df to compare with

    Example::

        snaf.gtex_visual_combine(uid='ENSG00000090339:E4.3-E4.5',norm=True,tumor=df)  
        snaf.gtex_visual_combine(uid='ENSG00000112149:E7.1-E9.1',norm=True,tumor=df)
    '''
    if not os.path.exists(outdir):
        os.mkdir(outdir)
    import plotly.graph_objects as go
    query = uid
    try:
        info = adata[[query],:]
    except:
        print('{} not detected in gtex or added controls, impute as zero'.format(query))
        info = ad.AnnData(X=csr_matrix(np.full((1,adata.shape[1]),0)),obs=pd.DataFrame(data={'mean':[0]},index=[uid]),var=adata.var)  # weired , anndata 0.7.6 can not modify the X in place? anndata 0.7.2 can do that in scTriangulate
    title = query
    identifier = query.replace(':','_')
    df_gtex = pd.DataFrame(data={'value':info.X.toarray().squeeze(),'tissue':info.var['tissue'].values},index=info.var_names)
    if norm:
        df_gtex['value'] = df_gtex['value'] / (info.var['total_count'])
    if tumor is not None:
        tumor_query_value = tumor.loc[query,:].values.squeeze()
        if norm:
            tumor_total_count = tumor.sum(axis=0)
            tumor_query_value = tumor_query_value / (tumor_total_count.values.squeeze()/1e6)
        tumor_sub_df = pd.DataFrame(data={'value':tumor_query_value,'tissue':['tumor']*tumor.shape[1]},index=tumor.columns)

    expr_tumor_dict = tumor_sub_df['value'].to_dict()   # {sample:value}
    expr_tumor_dict = {sample + ',' + 'tumor': value for sample,value in expr_tumor_dict.items()}  # {sample,tumor:value}
    expr_tumor_dict = {k:v for k,v in sorted(expr_tumor_dict.items(),key=lambda x:x[1])}
    expr_gtex_df = df_gtex  # index is sample name, two columns: value and tissue
    expr_gtex_dict = {row.Index + ',' + row.tissue: row.value for row in expr_gtex_df.itertuples()}   # {sample,tissue:value}
    expr_gtex_dict = {k:v for k,v in sorted(expr_gtex_dict.items(),key=lambda x:x[1])}
    node_x = []
    node_y = []
    node_text = []
    expr_gtex_dict.update(expr_tumor_dict)
    for i,(k,v) in enumerate(expr_gtex_dict.items()):
        node_x.append(i)
        node_y.append(v)
        node_text.append(k)
    node_trace = go.Scatter(x=node_x,y=node_y,mode='markers',marker={'color':'red','size':2},text=node_text,hoverinfo='text')
    fig = go.Figure(data=[node_trace],layout=go.Layout(showlegend=False))
    fig.update_xaxes(title_text='Samples(Normal -> Tumor)')
    y_axis_title = 'Count Per Million (Normalized)' if norm else 'Raw Read Count'
    fig.update_yaxes(title_text=y_axis_title)
    fig.write_html(os.path.join(outdir,'gtex_visual_combine_plotly_norm_{}_{}.html'.format(norm,identifier)))


[docs]def gtex_visual_combine_barplot(uid,norm=False,outdir='.',ylim=None,facecolor='#D99066',figsize=(6.4,4.8)):
    '''
    Shows the expression per tissue as bar plot

    :param uid: string, the uid of junction or gene
    :param norm: boolean, whether to normalize the raw count or not
    :param outdir: string, the output directory
    :param ylim: None or tuple, the lim of the y-axis
    :param facecolor: string, the facecolor of the barplot, recommended: #D99066, #50BF80 #795D8C
    :param figsize: tuple, the size of the figure

    Examples::

        gtex_visual_combine_barplot(uid=uid1,ylim=ylim,facecolor='#D99066',figsize=figsize,norm=norm,outdir=outdir)
        
    '''
    if not os.path.exists(outdir):
        os.mkdir(outdir)
    query = uid
    try:
        info = adata[[query],:]
    except:
        print('{} not detected in gtex, impute as zero'.format(query))
        info = ad.AnnData(X=csr_matrix(np.full((1,adata.shape[1]),0)),obs=pd.DataFrame(data={'mean':[0]},index=[uid]),var=adata.var)  # weired , anndata 0.7.6 can not modify the X in place? anndata 0.7.2 can do that in scTriangulate
    title = query
    identifier = query.replace(':','_')
    df = pd.DataFrame(data={'value':info.X.toarray().squeeze(),'tissue':info.var['tissue'].values},index=info.var_names)
    if norm:
        df['value'] = df['value'] / (info.var['total_count'])
    tmp = []
    for tissue,sub_df in df.groupby(by='tissue'):
        tmp.append((sub_df,sub_df['value'].mean(),tissue))
    fig,ax = plt.subplots(figsize=figsize)
    ax.bar(x=np.arange(len(tmp)),height=[item[1] for item in tmp],color=facecolor)
    ax.set_xticks(ticks=np.arange(len(tmp)),labels=[item[2] for item in tmp],fontsize=1,rotation=90)
    ax.set_xlabel('Tissues')
    if not norm:
        ax.set_ylabel('Raw Average Counts')
    else:
        ax.set_ylabel('Normalized read counts (CPM)')
    if ylim is not None:
        ax.set_ylim(ylim)
    plt.savefig(os.path.join(outdir,'gtex_visual_combine_barplot_norm_{}_{}.pdf'.format(norm,identifier)),bbox_inches='tight')
    plt.close()

[docs]def gtex_visual_combine(uid,norm=False,outdir='.',figsize=(6.4,4.8),tumor=None,ylim=None,group_by_tissue=True):
    ''' 
    Visualize the gtex expression and tumor specificity for splicing event (combine into one plot)

    :param uid: string, the uid for the splicing event
    :param norm: bool. whether normalize for the sequencing depth or not
    :param outdir: string, where the figure go into
    :param figsize: tuple, the (width,height) of the figure
    :param tumor: pandas dataframe, the tumor df to compare with
    :param ylim: tuple, modify the ylim of the (bottom, top) of the figure
    :param group_by_tissue: bool, whether to group the normal smaples by tissue or not, default is True

    Example::

        snaf.gtex_visual_combine(uid='ENSG00000090339:E4.3-E4.5',norm=True,tumor=df)  
        snaf.gtex_visual_combine(uid='ENSG00000112149:E7.1-E9.1',norm=True,tumor=df)
    '''
    if not os.path.exists(outdir):
        os.mkdir(outdir)
    query = uid
    try:
        info = adata[[query],:]
    except:
        print('{} not detected in gtex, impute as zero'.format(query))
        info = ad.AnnData(X=csr_matrix(np.full((1,adata.shape[1]),0)),obs=pd.DataFrame(data={'mean':[0]},index=[uid]),var=adata.var)  # weired , anndata 0.7.6 can not modify the X in place? anndata 0.7.2 can do that in scTriangulate
    title = query
    identifier = query.replace(':','_')
    df = pd.DataFrame(data={'value':info.X.toarray().squeeze(),'tissue':info.var['tissue'].values},index=info.var_names)
    if norm:
        df['value'] = df['value'] / (info.var['total_count'])
    tmp = []
    for tissue,sub_df in df.groupby(by='tissue'):
        tmp.append((sub_df,sub_df['value'].mean()))
    sorted_sub_df_list = list(list(zip(*sorted(tmp,key=lambda x:x[1])))[0])
    if tumor is not None:
        tumor_query_value = tumor.loc[query,:].values.squeeze()
        if norm:
            tumor_total_count = tumor.sum(axis=0)
            tumor_query_value = tumor_query_value / (tumor_total_count.values.squeeze()/1e6)
        tumor_sub_df = pd.DataFrame(data={'value':tumor_query_value,'tissue':['tumor']*tumor.shape[1]},index=tumor.columns)
        sorted_sub_df_list.append(tumor_sub_df)
    fig,ax = plt.subplots(figsize=figsize)
    x = 0
    x_list = []
    y_list = []
    v_delimiter = [0]
    xticklabel = []
    total_number_tissues = len(sorted_sub_df_list)
    if tumor is None:
        c_list = np.concatenate([np.array(['g']*sub_df.shape[0]) for sub_df in sorted_sub_df_list]).tolist()
    else:
        c_list_1 = np.concatenate([np.array(['g']*sub_df.shape[0]) for sub_df in sorted_sub_df_list[:-1]]).tolist()
        c_list_2 = ['r'] * sorted_sub_df_list[-1].shape[0]
        c_list = c_list_1 + c_list_2

    if not group_by_tissue:
        sorted_sub_df_list = [df,tumor_sub_df]

    for i,sub_df in enumerate(sorted_sub_df_list):
        sub_df.sort_values(by='value',inplace=True)
        n = sub_df.shape[0]
        xticklabel.append(sub_df['tissue'].iloc[0])
        for j,v in enumerate(sub_df['value']):
            x_list.append(x)
            y_list.append(v)
            x += 1
            if j == n-1:
                v_delimiter.append(x)
                x += 1
    ax.scatter(x_list,y_list,s=2,c=c_list,marker='o')
    for v in v_delimiter[1:-1]:
        ax.axvline(v,linestyle='--',linewidth=0.5)
    xtick = [(v + v_delimiter[i+1])/2 for i,v in enumerate(v_delimiter[:-1])]
    ax.set_xticks(xtick)
    ax.set_xticklabels(xticklabel,rotation=90,fontsize=1)
    
    ax.set_title(title)
    ylabel = 'Raw read counts'
    if norm:
        ylabel = 'Normalized read counts (CPM)'
    ax.set_ylabel(ylabel)
    ax.set_xlabel('Normal Tissues --> Tumor')
    if ylim is not None:
        ax.set_ylim(ylim)
    plt.savefig(os.path.join(outdir,'gtex_visual_combine_norm_{}_{}_groupbytissue_{}.pdf'.format(norm,identifier,group_by_tissue)),bbox_inches='tight')
    plt.close()

    return df
    






[docs]def gtex_visual_subplots(uid,norm=True,top=100,outdir='.'):
    ''' 
    Visualize the gtex expression and tumor specificity for splicing event (subplots)

    :param uid: string, the uid for the splicing event
    :param norm: bool. whether normalize for the sequencing depth or not
    :param outdir: string, where the figure go into

    Example::

        snaf.gtex_visual_subplots(query='ENSG00000090339:E4.3-E4.5',norm=True)
        snaf.gtex_visual_subplots(query='ENSG00000112149:E7.1-E9.1',norm=True)
    '''
    query = uid
    try:
        info = adata[[query],:]
    except:
        print('{} not detected in gtex, impute as zero'.format(query))
        info = ad.AnnData(X=csr_matrix(np.full((1,adata.shape[1]),0)),obs=pd.DataFrame(data={'mean':[0]},index=[uid]),var=adata.var)  # weired , anndata 0.7.6 can not modify the X in place? anndata 0.7.2 can do that in scTriangulate
    title = query
    identifier = query.replace(':','_')
    n_tissue = len(info.var['tissue'].unique())
    ncols = 5
    fig,axes = plt.subplots(figsize=(20,20),ncols=ncols,nrows=n_tissue//ncols+1,gridspec_kw={'wspace':0.5,'hspace':0.5})
    df_data = []
    for i,ax in enumerate(axes.flat):
        if i < n_tissue:
            tissue = info.var['tissue'].unique()[i]
            psi = info[:,info.var['tissue']==tissue].X.toarray().squeeze()
            non_zero_count = np.count_nonzero(psi)
            if norm:
                psi = psi / info[:,info.var['tissue']==tissue].var['total_count'].values
            try:
                ax.plot(np.arange(len(psi)),psi,marker='o',markersize=4,color='k',markerfacecolor='r',markeredgecolor='r',linestyle='--')
            except:
                psi = [psi]
                ax.plot(np.arange(len(psi)),psi,marker='o',markersize=4,color='k',markerfacecolor='r',markeredgecolor='r',linestyle='--')
            total_count = len(psi)
            ax.set_xticks(np.arange(len(psi)))
            ax.set_xticklabels(['s{}'.format(i) for i in np.arange(len(psi))],fontsize=4,rotation=60)
            ax.set_title('{}_count:{}/{}'.format(tissue,non_zero_count,total_count),fontsize=4)
            ax.set_ylim(bottom=-0.001,top=top)
            if norm:
                ax.set_ylabel('normalized counts')
            else:
                ax.set_ylabel('counts')
            # write the df
            df_data.append((tissue,total_count,non_zero_count,non_zero_count/total_count))
        else:
            ax.axis('off')
    df = pd.DataFrame.from_records(data=df_data,columns=['tissue','total_count','non_zero_count','ratio']).set_index(keys='tissue')
    df.to_csv(os.path.join(outdir,'tissue_specific_presence_{}.txt'.format(identifier)),sep='\t')
    fig.suptitle(title,fontsize=10)
    if norm:
        name = 'gtex_visual_subplots_norm_count_{}.pdf'.format(identifier)
    else:
        name = 'gtex_visual_subplots_count_{}.pdf'.format(identifier)
    plt.savefig(os.path.join(outdir,name),bbox_inches='tight')
    plt.close()