Source code for snaf.gtex

#!/data/salomonis2/LabFiles/Frank-Li/refactor/neo_env/bin/python3.7

import numpy as np
import pandas as pd
import os
import sys
import pickle
import h5py
import matplotlib.pyplot as plt
import anndata as ad
from scipy.optimize import minimize, minimize_scalar
from scipy import stats
from scipy.sparse import csr_matrix, find
from tqdm import tqdm
import re

# try:
#     import pymc3 as pm   # conda install -c conda-forge pymc3 mkl-service
#     import theano
#     import arviz as az
# except ImportError:
#     print('''
#         Optional package pymc3 is not installed, it is for calculating tumor specificity using hirerarchical bayesian model
#         For Linux: https://github.com/pymc-devs/pymc/wiki/Installation-Guide-(Linux)
#         For MacOS: https://github.com/pymc-devs/pymc/wiki/Installation-Guide-(MacOS)
#         For PC:    https://github.com/pymc-devs/pymc/wiki/Installation-Guide-(Windows)
#     ''')

# '''
# this script is to query the tumor specificity of the junction
# '''


def gtex_configuration(df,gtex_db,t_min_arg,n_max_arg,normal_cutoff_arg,tumor_cutoff_arg,normal_prevalance_cutoff_arg,tumor_prevalance_cutoff_arg,add_control=None):
    global adata_gtex
    global adata
    global t_min
    global n_max
    global normal_cutoff
    global tumor_cutoff
    global normal_prevalance_cutoff
    global tumor_prevalance_cutoff
    tested_junctions = set(df.index)
    adata = ad.read_h5ad(gtex_db)
    adata = adata[np.logical_not(adata.obs_names.duplicated()),:] 
    adata = adata[list(set(adata.obs_names).intersection(tested_junctions)),:]  
    print('Current loaded gtex cohort with shape {}'.format(adata.shape))
    tissue_dict = adata.var['tissue'].to_dict()
    adata_gtex = adata   # already has mean and tissue variables
    if add_control is not None:
        for id_, control in add_control.items():
            if isinstance(control,pd.DataFrame):
                assert len(set(control.columns).intersection(tissue_dict.keys())) == 0  # sample id can not be ambiguous
                control = control.loc[np.logical_not(control.index.duplicated()),:]
                control = control.loc[list(set(control.index).intersection(tested_junctions)),:]
                print('Adding cohort {} with shape {} to the database'.format(id_,control.shape))
                tissue_dict_right = {k:id_ for k in control.columns}
                tissue_dict.update(tissue_dict_right)
                df_left = adata.to_df()
                df_right = control
                df_combine = df_left.join(other=df_right,how='outer').fillna(0)
                adata = ad.AnnData(X=csr_matrix(df_combine.values),obs=pd.DataFrame(index=df_combine.index),var=pd.DataFrame(index=df_combine.columns))

            elif isinstance(control,ad.AnnData):
                assert len(set(control.var_names).intersection(tissue_dict.keys())) == 0
                control = control[np.logical_not(control.obs_names.duplicated()),:]
                control = control[list(set(control.obs_names).intersection(tested_junctions)),:]
                print('Adding cohort {} with shape {} to the database'.format(id_,control.shape))
                if 'tissue' in control.var.columns:   # if tissue is in var columns, it will be used 
                    tissue_dict_right = control.var['tissue'].to_dict()
                else:
                    tissue_dict_right = {k:id_ for k in control.var_names}
                tissue_dict.update(tissue_dict_right)
                df_left = adata.to_df()
                df_right = control.to_df()
                df_combine = df_left.join(other=df_right,how='outer').fillna(0)
                adata = ad.AnnData(X=csr_matrix(df_combine.values),obs=pd.DataFrame(index=df_combine.index),var=pd.DataFrame(index=df_combine.columns))
            
            else:
                raise Exception('control must be either in dataframe or anndata format')

            print('now the shape of control db is {}'.format(adata.shape))
            adata.var['tissue'] = adata.var_names.map(tissue_dict).values
            adata.obs['mean'] = np.array(adata.X.mean(axis=1)).squeeze()
            total_count = np.array(adata.X.sum(axis=0)).squeeze() / 1e6
            adata.var['total_count'] = total_count
            

    t_min = t_min_arg
    n_max = n_max_arg
    normal_cutoff = normal_cutoff_arg
    tumor_cutoff = tumor_cutoff_arg
    normal_prevalance_cutoff = normal_prevalance_cutoff_arg
    tumor_prevalance_cutoff = tumor_prevalance_cutoff_arg

    return adata


[docs]def get_all_normal_h5ad(uids,outdir,name): ''' Get the normal tissue expression as a h5ad file, so that you can run BayesTS analysis (https://github.com/frankligy/BayesTS#interface-with-snaf) :param uids: list, all the uids you'd like to query :param outdir: string, the output directory :param name: string, the h5ad name Example:: get_all_normal_h5ad(uids=uids,outdir='.',name='junction') ''' adata_new = adata[uids,:] print(adata_new) adata_new.write(os.path.join(outdir,'{}.h5ad'))
def multiple_crude_sifting(junction_count_matrix,add_control,dict_exonlist,outdir,filter_mode): if filter_mode == 'prevalance': valid,invalid,cond_df = multiple_crude_sifting_prevalance(junction_count_matrix,add_control,dict_exonlist,outdir) elif filter_mode == 'maxmin': valid,invalid,cond_df = multiple_crude_sifting_maxmin(junction_count_matrix,add_control,dict_exonlist,outdir) return valid,invalid, cond_df def multiple_crude_sifting_prevalance(junction_count_matrix,add_control=None,dict_exonlist=None,outdir='.'): if not os.path.exists(outdir): os.mkdir(outdir) df_to_write = [] df = pd.DataFrame(index=junction_count_matrix.index) prevalance_tumor = np.count_nonzero((junction_count_matrix > tumor_cutoff).values,axis=1) / junction_count_matrix.shape[1] df['prevalance_tumor'] = prevalance_tumor # consider gtex prevalance_normal = np.count_nonzero((adata_gtex.X > normal_cutoff).toarray(),axis=1) / adata_gtex.shape[1] prevalance_normal_dict = {j:v for j,v in zip(adata_gtex.obs_names,prevalance_normal)} df['prevalance_normal'] = df.index.map(prevalance_normal_dict).fillna(value=0) df['cond'] = (df['prevalance_tumor'] > tumor_prevalance_cutoff) & (df['prevalance_normal'] < normal_prevalance_cutoff) valid = df.loc[df['cond']].index.tolist() tmp = df.copy() df_to_write.append(tmp) print('reduce valid NeoJunction from {} to {} because they are present in GTEx'.format(df.shape[0],len(valid))) if dict_exonlist is not None: # a valid junction can not be present in any ensembl documented transcript updated_valid = [] for uid in tqdm(valid): ensg = uid.split(':')[0] exons = ':'.join(uid.split(':')[1:]) if '_' in exons or 'U' in exons or 'ENSG' in exons or 'I' in exons: updated_valid.append(uid) else: exonlist = dict_exonlist[ensg] exonstring = '|'.join(exonlist) e1,e2 = exons.split('-') pattern1 = re.compile(r'^{}\|{}\|'.format(e1,e2)) # ^E1.1|E2.3| pattern2 = re.compile(r'\|{}\|{}$'.format(e1,e2)) # |E1.1|E2.3$ pattern3 = re.compile(r'\|{}\|{}\|'.format(e1,e2)) # |E1.1|E2.3| if re.search(pattern3,exonstring) or re.search(pattern2,exonstring) or re.search(pattern1,exonstring): # as long as match one pattern, should be eliminated continue else: updated_valid.append(uid) print('reduce valid Neojunction from {} to {} because they are present in Ensembl db'.format(len(valid),len(updated_valid))) valid = updated_valid # consider add_control if add_control is not None: for i,(id_,control) in enumerate(add_control.items()): n_previous_valid = len(valid) if isinstance(control,pd.DataFrame): prevalance_normal = np.count_nonzero((control > normal_cutoff).values,axis=1) / control.shape[1] prevalance_normal_dict = {j:v for j,v in zip(control.index, prevalance_normal)} elif isinstance(control,ad.AnnData): prevalance_normal = np.count_nonzero((control.X > normal_cutoff).toarray(),axis=1) / control.shape[1] prevalance_normal_dict = {j:v for j,v in zip(control.obs_names,prevalance_normal)} else: raise Exception('control must be either in dataframe or anndata format') df['prevalance_normal_add'] = df.index.map(prevalance_normal_dict).fillna(value=0) df['cond_add'] = (df['prevalance_tumor'] > tumor_prevalance_cutoff) & (df['prevalance_normal_add'] < normal_prevalance_cutoff) valid_add = df.loc[df['cond_add']].index.tolist() valid = list(set(valid).intersection(set(valid_add))) tmp = df.copy(); tmp.drop(columns=['prevalance_tumor','prevalance_normal','cond'],inplace=True); tmp.rename(columns=lambda x:x+'_{}'.format(id_),inplace=True) df_to_write.append(tmp) print('reduce valid Neojunction from {} to {} because they are present in added control {}'.format(n_previous_valid,len(valid),id_)) invalid = list(set(junction_count_matrix.index).difference(set(valid))) # now, consider each entry t_min = tumor_cutoff valid_set = set(valid) cond_dict = {j:(True if j in valid_set else False) for j in junction_count_matrix.index} tmp = pd.DataFrame(index=junction_count_matrix.index,data={'placeholder':junction_count_matrix.index.map(cond_dict).values}) first_half_cond_df = pd.concat([tmp]*junction_count_matrix.shape[1],axis=1) first_half_cond_df.columns = junction_count_matrix.columns cond_df = (first_half_cond_df) & (junction_count_matrix > t_min) # write the df df_to_write = pd.concat(df_to_write,axis=1) df_to_write.to_csv(os.path.join(outdir,'NeoJunction_statistics_prevalance.txt'),sep='\t') return valid,invalid,cond_df def multiple_crude_sifting_maxmin(junction_count_matrix,add_control=None,dict_exonlist=None,outdir='.'): # for JunctionCountMatrixQuery class, only consider gtex if not os.path.exists(outdir): os.mkdir(outdir) df = pd.DataFrame(index=junction_count_matrix.index,data = {'max':junction_count_matrix.max(axis=1).values}) df_to_write = [] # consider gtex junction_to_mean = adata_gtex.obs.loc[adata_gtex.obs_names.isin(junction_count_matrix.index),'mean'].to_dict() df['mean'] = df.index.map(junction_to_mean).fillna(value=0) df['diff'] = df['max'] - df['mean'] df['cond'] = (df['mean'] < n_max) & (df['diff'] > t_min) valid = df.loc[df['cond']].index.tolist() tmp = df.copy() df_to_write.append(tmp) print('reduce valid NeoJunction from {} to {} because they are present in GTEx'.format(df.shape[0],len(valid))) if dict_exonlist is not None: # a valid junction can not be present in any ensembl documented transcript updated_valid = [] for uid in tqdm(valid): ensg = uid.split(':')[0] exons = ':'.join(uid.split(':')[1:]) if '_' in exons or 'U' in exons or 'ENSG' in exons or 'I' in exons: updated_valid.append(uid) else: exonlist = dict_exonlist[ensg] exonstring = '|'.join(exonlist) e1,e2 = exons.split('-') pattern1 = re.compile(r'^{}\|{}\|'.format(e1,e2)) # ^E1.1|E2.3| pattern2 = re.compile(r'\|{}\|{}$'.format(e1,e2)) # |E1.1|E2.3$ pattern3 = re.compile(r'\|{}\|{}\|'.format(e1,e2)) # |E1.1|E2.3| if re.search(pattern3,exonstring) or re.search(pattern2,exonstring) or re.search(pattern1,exonstring): # as long as match one pattern, should be eliminated continue else: updated_valid.append(uid) print('reduce valid Neojunction from {} to {} because they are present in Ensembl db'.format(len(valid),len(updated_valid))) valid = updated_valid # consider add_control mean_add_list = [] if add_control is not None: for i,(id_,control) in enumerate(add_control.items()): n_previous_valid = len(valid) if isinstance(control,pd.DataFrame): junction_to_mean = control.mean(axis=1).to_dict() elif isinstance(control,ad.AnnData): junction_to_mean = control.to_df().mean(axis=1).to_dict() else: raise Exception('control must be either in dataframe or anndata format') df['mean_add'] = df.index.map(junction_to_mean).fillna(value=0) df['diff_add'] = df['max'] - df['mean_add'] df['cond_add'] = (df['mean_add'] < n_max) & (df['diff_add'] > t_min) mean_add_list.append(df['mean_add']) valid_add = df.loc[df['cond_add']].index.tolist() valid = list(set(valid).intersection(set(valid_add))) tmp = df.copy(); tmp.drop(columns=['mean','diff','cond'],inplace=True); tmp.rename(columns=lambda x:x+'_{}'.format(id_),inplace=True) df_to_write.append(tmp) print('reduce valid Neojunction from {} to {} because they are present in added control {}'.format(n_previous_valid,len(valid),id_)) invalid = list(set(junction_count_matrix.index).difference(set(valid))) # now, consider each entry gtex_df = pd.concat([df['mean']]*junction_count_matrix.shape[1],axis=1) gtex_df.columns = junction_count_matrix.columns diff_df_gtex = junction_count_matrix - gtex_df cond_df = (gtex_df < n_max) & (diff_df_gtex > t_min) if add_control is not None: for mean_add in mean_add_list: add_df = pd.concat([mean_add]*junction_count_matrix.shape[1],axis=1) add_df.columns = junction_count_matrix.columns diff_df_add = junction_count_matrix - add_df cond_df = cond_df & (add_df < n_max) & (diff_df_add > t_min) # write the df df_to_write = pd.concat(df_to_write,axis=1) df_to_write.to_csv(os.path.join(outdir,'NeoJunction_statistics_maxmin.txt'),sep='\t') return valid,invalid,cond_df def crude_tumor_specificity(uid,count): # for NeoJunction class, since we normally start from Jcmq with check_gtex=False, rarely being called. detail = '' if uid not in set(adata.obs_names): mean_value = 0 else: mean_value = adata.obs.loc[uid,'mean'] diff = count - mean_value if mean_value < n_max and diff >= t_min: identity = True else: identity = False return identity,mean_value def mle_func(parameters,y): sigma = parameters ll = np.sum(stats.halfnorm.logpdf(y,0,sigma)) neg_ll = -1 * ll return neg_ll def split_df_to_chunks(df,cores=None): df_index = np.arange(df.shape[0]) if cores is None: cores = mp.cpu_count() sub_indices = np.array_split(df_index,cores) sub_dfs = [df.iloc[sub_index,:] for sub_index in sub_indices] return sub_dfs def split_array_to_chunks(array,cores=None): if not isinstance(array,list): raise Exception('split_array_to_chunks function works for list, not ndarray') array_index = np.arange(len(array)) if cores is None: cores = mp.cpu_count() sub_indices = np.array_split(array_index,cores) sub_arrays = [] for sub_index in sub_indices: item_in_group = [] for i in sub_index: item_in_group.append(array[i]) sub_arrays.append(item_in_group) return sub_arrays
[docs]def add_tumor_specificity_frequency_table(df,method='mean',remove_quote=True,cores=None): ''' add tumor specificty to each neoantigen-uid in the frequency table produced by SNAF T pipeline :param df: DataFrame, the frequency table produced by SNAF T pipeline :param method: string, either 'mean', or 'mle', or 'bayesian' :param remove quote: boolean, whether to remove the quotation or not, as one column in frequency table df is list, when loaded in memory using pandas, it will be added a quote, we can remove it :param cores: int, how many cpu cores to use for this computation, default None and use all the cpu the program detected :return new_df: a dataframe with one added column containing tumor specificity score Example:: snaf.add_tumor_specificity_frequency_table(df,'mle',remove_quote=True) ''' from ast import literal_eval import multiprocessing as mp if remove_quote: df['samples'] = [literal_eval(item) for item in df['samples']] if cores is None: cores = mp.cpu_count() if method != 'bayesian': pool = mp.Pool(processes=cores) print('{} subprocesses have been spawned'.format(cores)) all_unique_junctions = list(set([item.split(',')[1] for item in df.index])) sub_arrays = split_array_to_chunks(all_unique_junctions,cores=cores) r = [pool.apply_async(func=add_tumor_specificity_frequency_table_atomic_func,args=(sub_array,method,)) for sub_array in sub_arrays] pool.close() pool.join() results = [] for collect in r: result = collect.get() results.append(result) all_score_dict = {} for score_dict in results: all_score_dict.update(score_dict) col = [] for item in df.index: col.append(all_score_dict[item.split(',')[1]]) new_df = df.copy() new_df['tumor_specificity_{}'.format(method)] = col else: # seems like bayesian doesn't work well with multiprocessing all_unique_junctions = list(set([item.split(',')[1] for item in df.index])) score_dict = {} for uid in tqdm(all_unique_junctions,total=len(all_unique_junctions)): score_dict[uid] = tumor_specificity(uid,'bayesian') for item in df.index: col.append(score_dict[item.split(',')[1]]) new_df = df.copy() new_df['tumor_specificity_{}'.format(method)] = col return new_df
def add_tumor_specificity_frequency_table_atomic_func(sub_array,method): uid_list = sub_array score_dict = {uid:tumor_specificity(uid,method) for uid in tqdm(uid_list,total=len(uid_list))} return score_dict def tumor_specificity(uid,method,return_df=False): try: info = adata[[uid],:] except: print('{} not detected in gtex, impute as zero'.format(uid)) info = ad.AnnData(X=csr_matrix(np.full((1,adata.shape[1]),0)),obs=pd.DataFrame(data={'mean':[0]},index=[uid]),var=adata.var) # weired , anndata 0.7.6 can not modify the X in place? anndata 0.7.2 can do that in scTriangulate df = pd.DataFrame(data={'value':info.X.toarray().squeeze(),'tissue':info.var['tissue'].values},index=info.var_names) if method == 'mean': try: sigma = adata.obs.loc[uid,'mean'] except KeyError: sigma = 0 if return_df: return sigma,df else: return sigma elif method == 'mle': scale_factor_dict = adata.var['total_count'].to_dict() df['value_cpm'] = df['value'].values / df.index.map(scale_factor_dict).values y = df['value_cpm'].values # mle_model = minimize(mle_func,np.array([0.2]),args=(y,),bounds=((0,1),),method='Nelder-Mead') mle_model = minimize_scalar(mle_func,bounds=(0,1),args=(y,),method='bounded') if mle_model.success: sigma = mle_model.x else: sigma = 0 print(uid,y, mle_model) # debug purpose if return_df: return sigma,df else: return sigma elif method == 'bayesian': scale_factor_dict = adata.var['total_count'].to_dict() df['value_cpm'] = df['value'].values / df.index.map(scale_factor_dict).values y = df['value_cpm'].values x = [] for tissue in adata.var['tissue'].unique(): sub = adata[uid,adata.var['tissue']==tissue] total_count = sub.shape[1] c = np.count_nonzero(sub.X.toarray()) scaled_c = round(c * (25/total_count),0) x.append(scaled_c) x = np.array(x) try: with pm.Model() as m: sigma = pm.Uniform('sigma',lower=0,upper=1) nc = pm.HalfNormal('nc',sigma=sigma,observed=y) nc_hat = pm.Deterministic('nc_hat',pm.math.sum(nc)/len(y)) psi = pm.Beta('psi',alpha=2,beta=nc_hat*20) mu = pm.Gamma('mu',alpha=nc_hat*50,beta=1) c = pm.ZeroInflatedPoisson('c',psi,mu,observed=x) trace = pm.sample(draws=1000,step=pm.NUTS(target_accept=0.95),tune=1000,return_inferencedata=False,cores=1) ''' the error of "Got error No model on context stack. trying to find log_likelihood in translation" maybe due to pymc build and how they launch multi-cores. remember, my build can only work when cores=1, which further indicate there might be an issue revolving around it. https://stackoverflow.com/questions/69888492/sampling-of-pymc3-in-python-gets-runtime-error-of-bootstrapping-phase ''' df = az.summary(trace,round_to=2) ''' az.summary(trace) mean sd hdi_3% hdi_97% mcse_mean mcse_sd ess_bulk ess_tail r_hat sigma 0.47 0.01 0.46 0.48 0.00 0.00 182.23 98.84 1.02 nc_hat 0.22 0.00 0.22 0.22 0.00 0.00 200.00 200.00 NaN mu 22.97 0.52 21.87 23.87 0.04 0.03 196.69 94.84 1.00 az.plot_posterior(trace,var_names=['sigma','nc_hat','mu']) az.plot_forest(trace,,var_names=['sigma','nc_hat','mu']) gv = pm.model_to_graphviz(m) gv.format = 'pdf' gv.render(filename='model_graph');sys.exit('stop') # to run the above, you need to module load graphviz so that dot is exposed to the program ''' sigma = df.iloc[0]['mean'] except: sigma = None print(uid,x,y) if return_df: return sigma,df else: return sigma