Source code for snaf.gtex_viewer


 

import numpy as np
import pandas as pd
import os
import sys
import pickle
import h5py
import matplotlib.pyplot as plt
import anndata as ad
import seaborn as sns
from scipy.sparse import csr_matrix


import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42
mpl.rcParams['font.family'] = 'Arial'

# gtex viewer

def gtex_viewer_configuration(adata_passed_in):
    global adata
    adata = adata_passed_in

[docs]def gtex_visual_norm_count_combined(uid,xlim=None,ylim=None,save_df=False,outdir='.'): ''' This function is to visualize the normalized count (CPM) distribution, it should follow a half-norm distribution, this plot will help determine the choice in bayesian modeling for tumor specificity score :param uid: string, the uid for the splicing event :param xlim: None or tuple, whether to constrain the xlim of the histplot :param ylim: None or tuple, whether to constrain the ylim of the histplot :param save_df: boolean, whether to save a txt file reporting all the cpm values and the tissue information for debug purpose :param outdir: string, where the figure go into Example:: snaf.gtex_visual_norm_count_combined(uid='ENSG00000090339:E4.3-E4.5') ''' if not os.path.exists(outdir): os.mkdir(outdir) info = adata[[uid],:] scale_factor_dict = adata.var['total_count'].to_dict() df = pd.DataFrame(data={'value':info.X.toarray().squeeze(),'tissue':info.var['tissue'].values},index=info.var_names) df['value_cpm'] = df['value'].values / df.index.map(scale_factor_dict).values data = df['value_cpm'].values fig, ax = plt.subplots() sns.histplot(data,bins=100,ax=ax) if xlim is not None: ax.set_xlim(xlim) if ylim is not None: ax.set_ylim(ylim) ax.set_xlabel('Count Per Million(CPM)') plt.savefig(os.path.join(outdir,'hist_{}.pdf'.format(uid.replace(':','_'))),bbox_inches='tight') plt.close() if save_df: df.to_csv(os.path.join(outdir,'cpm_{}.txt'.format(uid.replace(':','_'))),sep='\t') return df
[docs]def gtex_visual_per_tissue_count(uid,total_count=10, count_cutoff=1, total=25, outdir='.'): ''' This function is to visualize the random variable X, which repesents the number of samples in each tissue type that express this junction. It should follow a zeroinflated poisson distribution. We do not consider tissues with less than total_count samples, and we use count_cutoff to represent how many count can be defined as "expressed" versus "non-expressed". We scaled all the number of samples to the total of "total" to make sure they are comparable. Because if a tissue type has 1000 samples, another has 25 samples, this random variable is not directly comparable. this plot will help determine the choice in bayesian modeling for tumor specificity score :param uid: string, the uid for the splicing event :param outdir: string, where the figure go into Example:: snaf.gtex_visual_per_tissue_count(uid='ENSG00000090339:E4.3-E4.5') ''' if not os.path.exists(outdir): os.mkdir(outdir) x = [] for tissue in adata.var['tissue'].unique(): sub = adata[uid,adata.var['tissue']==tissue] total_count = sub.shape[1] if total_count >= total_count: c = np.count_nonzero(np.where(sub.X.toarray()<count_cutoff,0,sub.X.toarray())) scaled_c = round(c * (total/total_count),0) x.append(scaled_c) x = np.array(x) fig,ax = plt.subplots() try: sns.histplot(x,binwidth=1,ax=ax) except: sns.histplot(x,ax=ax) # if x is with no variance at all, binwidth can not be 1 ax.set_xlabel('Number of samples expressing this target in each tissue type') plt.savefig(os.path.join(outdir,'tissue_dist_{}.pdf'.format(uid.replace(':','_'))),bbox_inches='tight') plt.close()
[docs]def gtex_visual_combine_plotly(uid,outdir='',norm=False,tumor=None): ''' Generate combined plot but interactively :param uid: string, the uid for the splicing event :param norm: bool. whether normalize for the sequencing depth or not :param outdir: string, where the figure go into :param tumor: pandas dataframe, the tumor df to compare with Example:: snaf.gtex_visual_combine(uid='ENSG00000090339:E4.3-E4.5',norm=True,tumor=df) snaf.gtex_visual_combine(uid='ENSG00000112149:E7.1-E9.1',norm=True,tumor=df) ''' if not os.path.exists(outdir): os.mkdir(outdir) import plotly.graph_objects as go query = uid try: info = adata[[query],:] except: print('{} not detected in gtex or added controls, impute as zero'.format(query)) info = ad.AnnData(X=csr_matrix(np.full((1,adata.shape[1]),0)),obs=pd.DataFrame(data={'mean':[0]},index=[uid]),var=adata.var) # weired , anndata 0.7.6 can not modify the X in place? anndata 0.7.2 can do that in scTriangulate title = query identifier = query.replace(':','_') df_gtex = pd.DataFrame(data={'value':info.X.toarray().squeeze(),'tissue':info.var['tissue'].values},index=info.var_names) if norm: df_gtex['value'] = df_gtex['value'] / (info.var['total_count']) if tumor is not None: tumor_query_value = tumor.loc[query,:].values.squeeze() if norm: tumor_total_count = tumor.sum(axis=0) tumor_query_value = tumor_query_value / (tumor_total_count.values.squeeze()/1e6) tumor_sub_df = pd.DataFrame(data={'value':tumor_query_value,'tissue':['tumor']*tumor.shape[1]},index=tumor.columns) expr_tumor_dict = tumor_sub_df['value'].to_dict() # {sample:value} expr_tumor_dict = {sample + ',' + 'tumor': value for sample,value in expr_tumor_dict.items()} # {sample,tumor:value} expr_tumor_dict = {k:v for k,v in sorted(expr_tumor_dict.items(),key=lambda x:x[1])} expr_gtex_df = df_gtex # index is sample name, two columns: value and tissue expr_gtex_dict = {row.Index + ',' + row.tissue: row.value for row in expr_gtex_df.itertuples()} # {sample,tissue:value} expr_gtex_dict = {k:v for k,v in sorted(expr_gtex_dict.items(),key=lambda x:x[1])} node_x = [] node_y = [] node_text = [] expr_gtex_dict.update(expr_tumor_dict) for i,(k,v) in enumerate(expr_gtex_dict.items()): node_x.append(i) node_y.append(v) node_text.append(k) node_trace = go.Scatter(x=node_x,y=node_y,mode='markers',marker={'color':'red','size':2},text=node_text,hoverinfo='text') fig = go.Figure(data=[node_trace],layout=go.Layout(showlegend=False)) fig.update_xaxes(title_text='Samples(Normal -> Tumor)') y_axis_title = 'Count Per Million (Normalized)' if norm else 'Raw Read Count' fig.update_yaxes(title_text=y_axis_title) fig.write_html(os.path.join(outdir,'gtex_visual_combine_plotly_norm_{}_{}.html'.format(norm,identifier)))
[docs]def gtex_visual_combine_barplot(uid,norm=False,outdir='.',ylim=None,facecolor='#D99066',figsize=(6.4,4.8)): ''' Shows the expression per tissue as bar plot :param uid: string, the uid of junction or gene :param norm: boolean, whether to normalize the raw count or not :param outdir: string, the output directory :param ylim: None or tuple, the lim of the y-axis :param facecolor: string, the facecolor of the barplot, recommended: #D99066, #50BF80 #795D8C :param figsize: tuple, the size of the figure Examples:: gtex_visual_combine_barplot(uid=uid1,ylim=ylim,facecolor='#D99066',figsize=figsize,norm=norm,outdir=outdir) ''' if not os.path.exists(outdir): os.mkdir(outdir) query = uid try: info = adata[[query],:] except: print('{} not detected in gtex, impute as zero'.format(query)) info = ad.AnnData(X=csr_matrix(np.full((1,adata.shape[1]),0)),obs=pd.DataFrame(data={'mean':[0]},index=[uid]),var=adata.var) # weired , anndata 0.7.6 can not modify the X in place? anndata 0.7.2 can do that in scTriangulate title = query identifier = query.replace(':','_') df = pd.DataFrame(data={'value':info.X.toarray().squeeze(),'tissue':info.var['tissue'].values},index=info.var_names) if norm: df['value'] = df['value'] / (info.var['total_count']) tmp = [] for tissue,sub_df in df.groupby(by='tissue'): tmp.append((sub_df,sub_df['value'].mean(),tissue)) fig,ax = plt.subplots(figsize=figsize) ax.bar(x=np.arange(len(tmp)),height=[item[1] for item in tmp],color=facecolor) ax.set_xticks(ticks=np.arange(len(tmp)),labels=[item[2] for item in tmp],fontsize=1,rotation=90) ax.set_xlabel('Tissues') if not norm: ax.set_ylabel('Raw Average Counts') else: ax.set_ylabel('Normalized read counts (CPM)') if ylim is not None: ax.set_ylim(ylim) plt.savefig(os.path.join(outdir,'gtex_visual_combine_barplot_norm_{}_{}.pdf'.format(norm,identifier)),bbox_inches='tight') plt.close()
[docs]def gtex_visual_combine(uid,norm=False,outdir='.',figsize=(6.4,4.8),tumor=None,ylim=None,group_by_tissue=True): ''' Visualize the gtex expression and tumor specificity for splicing event (combine into one plot) :param uid: string, the uid for the splicing event :param norm: bool. whether normalize for the sequencing depth or not :param outdir: string, where the figure go into :param figsize: tuple, the (width,height) of the figure :param tumor: pandas dataframe, the tumor df to compare with :param ylim: tuple, modify the ylim of the (bottom, top) of the figure :param group_by_tissue: bool, whether to group the normal smaples by tissue or not, default is True Example:: snaf.gtex_visual_combine(uid='ENSG00000090339:E4.3-E4.5',norm=True,tumor=df) snaf.gtex_visual_combine(uid='ENSG00000112149:E7.1-E9.1',norm=True,tumor=df) ''' if not os.path.exists(outdir): os.mkdir(outdir) query = uid try: info = adata[[query],:] except: print('{} not detected in gtex, impute as zero'.format(query)) info = ad.AnnData(X=csr_matrix(np.full((1,adata.shape[1]),0)),obs=pd.DataFrame(data={'mean':[0]},index=[uid]),var=adata.var) # weired , anndata 0.7.6 can not modify the X in place? anndata 0.7.2 can do that in scTriangulate title = query identifier = query.replace(':','_') df = pd.DataFrame(data={'value':info.X.toarray().squeeze(),'tissue':info.var['tissue'].values},index=info.var_names) if norm: df['value'] = df['value'] / (info.var['total_count']) tmp = [] for tissue,sub_df in df.groupby(by='tissue'): tmp.append((sub_df,sub_df['value'].mean())) sorted_sub_df_list = list(list(zip(*sorted(tmp,key=lambda x:x[1])))[0]) if tumor is not None: tumor_query_value = tumor.loc[query,:].values.squeeze() if norm: tumor_total_count = tumor.sum(axis=0) tumor_query_value = tumor_query_value / (tumor_total_count.values.squeeze()/1e6) tumor_sub_df = pd.DataFrame(data={'value':tumor_query_value,'tissue':['tumor']*tumor.shape[1]},index=tumor.columns) sorted_sub_df_list.append(tumor_sub_df) fig,ax = plt.subplots(figsize=figsize) x = 0 x_list = [] y_list = [] v_delimiter = [0] xticklabel = [] total_number_tissues = len(sorted_sub_df_list) if tumor is None: c_list = np.concatenate([np.array(['g']*sub_df.shape[0]) for sub_df in sorted_sub_df_list]).tolist() else: c_list_1 = np.concatenate([np.array(['g']*sub_df.shape[0]) for sub_df in sorted_sub_df_list[:-1]]).tolist() c_list_2 = ['r'] * sorted_sub_df_list[-1].shape[0] c_list = c_list_1 + c_list_2 if not group_by_tissue: sorted_sub_df_list = [df,tumor_sub_df] for i,sub_df in enumerate(sorted_sub_df_list): sub_df.sort_values(by='value',inplace=True) n = sub_df.shape[0] xticklabel.append(sub_df['tissue'].iloc[0]) for j,v in enumerate(sub_df['value']): x_list.append(x) y_list.append(v) x += 1 if j == n-1: v_delimiter.append(x) x += 1 ax.scatter(x_list,y_list,s=2,c=c_list,marker='o') for v in v_delimiter[1:-1]: ax.axvline(v,linestyle='--',linewidth=0.5) xtick = [(v + v_delimiter[i+1])/2 for i,v in enumerate(v_delimiter[:-1])] ax.set_xticks(xtick) ax.set_xticklabels(xticklabel,rotation=90,fontsize=1) ax.set_title(title) ylabel = 'Raw read counts' if norm: ylabel = 'Normalized read counts (CPM)' ax.set_ylabel(ylabel) ax.set_xlabel('Normal Tissues --> Tumor') if ylim is not None: ax.set_ylim(ylim) plt.savefig(os.path.join(outdir,'gtex_visual_combine_norm_{}_{}_groupbytissue_{}.pdf'.format(norm,identifier,group_by_tissue)),bbox_inches='tight') plt.close() return df
[docs]def gtex_visual_subplots(uid,norm=True,top=100,outdir='.'): ''' Visualize the gtex expression and tumor specificity for splicing event (subplots) :param uid: string, the uid for the splicing event :param norm: bool. whether normalize for the sequencing depth or not :param outdir: string, where the figure go into Example:: snaf.gtex_visual_subplots(query='ENSG00000090339:E4.3-E4.5',norm=True) snaf.gtex_visual_subplots(query='ENSG00000112149:E7.1-E9.1',norm=True) ''' query = uid try: info = adata[[query],:] except: print('{} not detected in gtex, impute as zero'.format(query)) info = ad.AnnData(X=csr_matrix(np.full((1,adata.shape[1]),0)),obs=pd.DataFrame(data={'mean':[0]},index=[uid]),var=adata.var) # weired , anndata 0.7.6 can not modify the X in place? anndata 0.7.2 can do that in scTriangulate title = query identifier = query.replace(':','_') n_tissue = len(info.var['tissue'].unique()) ncols = 5 fig,axes = plt.subplots(figsize=(20,20),ncols=ncols,nrows=n_tissue//ncols+1,gridspec_kw={'wspace':0.5,'hspace':0.5}) df_data = [] for i,ax in enumerate(axes.flat): if i < n_tissue: tissue = info.var['tissue'].unique()[i] psi = info[:,info.var['tissue']==tissue].X.toarray().squeeze() non_zero_count = np.count_nonzero(psi) if norm: psi = psi / info[:,info.var['tissue']==tissue].var['total_count'].values try: ax.plot(np.arange(len(psi)),psi,marker='o',markersize=4,color='k',markerfacecolor='r',markeredgecolor='r',linestyle='--') except: psi = [psi] ax.plot(np.arange(len(psi)),psi,marker='o',markersize=4,color='k',markerfacecolor='r',markeredgecolor='r',linestyle='--') total_count = len(psi) ax.set_xticks(np.arange(len(psi))) ax.set_xticklabels(['s{}'.format(i) for i in np.arange(len(psi))],fontsize=4,rotation=60) ax.set_title('{}_count:{}/{}'.format(tissue,non_zero_count,total_count),fontsize=4) ax.set_ylim(bottom=-0.001,top=top) if norm: ax.set_ylabel('normalized counts') else: ax.set_ylabel('counts') # write the df df_data.append((tissue,total_count,non_zero_count,non_zero_count/total_count)) else: ax.axis('off') df = pd.DataFrame.from_records(data=df_data,columns=['tissue','total_count','non_zero_count','ratio']).set_index(keys='tissue') df.to_csv(os.path.join(outdir,'tissue_specific_presence_{}.txt'.format(identifier)),sep='\t') fig.suptitle(title,fontsize=10) if norm: name = 'gtex_visual_subplots_norm_count_{}.pdf'.format(identifier) else: name = 'gtex_visual_subplots_count_{}.pdf'.format(identifier) plt.savefig(os.path.join(outdir,name),bbox_inches='tight') plt.close()