Source code for snaf.snaf

#!/data/salomonis2/LabFiles/Frank-Li/refactor/neo_env/bin/python3.7

import numpy as np
import pandas as pd
import os
import sys
import pickle
import h5py
from functools import reduce
from copy import deepcopy
import matplotlib.pyplot as plt
import matplotlib as mpl
import anndata as ad
import seaborn as sns
from scipy import stats
from scipy.optimize import minimize
import re
import requests
import xmltodict
import multiprocessing as mp
import functools
from tqdm import tqdm
from itertools import compress
from ast import literal_eval
from bisect import bisect, bisect_left


# for biopython, pip install biopython
from Bio.SeqIO.FastaIO import SimpleFastaParser
from Bio.Seq import Seq

# other modules
from .binding import run_netMHCpan,run_MHCflurry
from .deepimmuno import run_deepimmuno
from .data_io import *
from .visualize import *
from .gtex import *
from .gtex_viewer import *
from .downstream import *


import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42
mpl.rcParams['font.family'] = 'Arial'


# configuration
def snaf_configuration(exon_table,transcript_db,db_dir,fasta,software_path_arg=None,binding_method_arg=None):
    global dict_exonCoords
    global dict_fa
    global software_path
    global binding_method
    global dict_exonlist
    global dict_start_codon
    global phase_inferer_gtf_dict
    dict_exonCoords = exonCoords_to_dict(exon_table)
    dict_exonlist = construct_dict_exonlist(transcript_db)
    dict_fa = fasta_to_dict(fasta)
    software_path = software_path_arg
    binding_method = binding_method_arg

    df_start_codon = pd.read_csv(os.path.join(db_dir,'Alt91_db','df_start_codon.txt'),sep='\t',index_col=0)
    df_start_codon['start_codon'] = [literal_eval(item) for item in df_start_codon['start_codon']]
    df_start_codon['non_redundant'] = [literal_eval(item) for item in df_start_codon['non_redundant']]
    dict_start_codon = df_start_codon['start_codon'].to_dict()

    phase_inferer_gtf_dict = process_gtf(os.path.join(db_dir,'Alt91_db','Homo_sapiens.GRCh38.91.gtf'))

    


def process_gtf(gtf):
    # any official ensembl release gtf format
    '''
    gtf_dict['ENSG00000186092']

        {'ENST00000641515': [(65419, 65433), (65520, 65573), (69037, 71585)],
        'ENST00000335137': [(69055, 70108)]}
    '''
    gtf_dict = {}
    with open(gtf,'r') as f:
        for line in f:
            try:
                chrom, source, typ, start, end, score, strand, phase, attrs = line.rstrip('\n').split('\t')
            except ValueError:  # the header line
                continue
            if typ == 'gene':
                ensg = attrs.split(';')[0].split(' ')[1].strip('"')
                gtf_dict[ensg] = {}
            elif typ == 'transcript':
                enst = attrs.split(';')[2].split(' ')[2].strip('"')
                gtf_dict[ensg][enst] = []
            elif typ == 'exon':
                gtf_dict[ensg][enst].append((int(start),int(end)))
    return gtf_dict

def get_support_phase(ensg, coord_first_exon_last_base, pssc, strand, length_first):
    coord_first_exon_last_base, pssc, length_first = int(coord_first_exon_last_base), int(pssc), int(length_first)
    all_trans = phase_inferer_gtf_dict[ensg]
    supports = []
    if strand == '+':
        for enst, tran in all_trans.items():
            lis = []
            [lis.extend(list(exon)) for exon in tran]
            pssc_insert_pos = bisect(lis, pssc)
            coord_first_exon_first_base = coord_first_exon_last_base - length_first + 1
            junction_insert_pos = bisect(lis, coord_first_exon_first_base)

            if junction_insert_pos % 2 == 1 and pssc_insert_pos % 2 == 1:
                if junction_insert_pos == pssc_insert_pos:
                    n_bases = coord_first_exon_first_base - pssc + 1
                elif junction_insert_pos > pssc_insert_pos:
                    start_exon_index = (pssc_insert_pos - 1) // 2
                    end_exon_index = (junction_insert_pos - 1) // 2
                    n_bases = 0
                    for i, exon in enumerate(tran):
                        if i == start_exon_index:
                            n_bases += (exon[1] - pssc + 1)
                        elif i == end_exon_index:
                            n_bases += (coord_first_exon_first_base - exon[0] + 1)
                        else:
                            if i > start_exon_index and i < end_exon_index:
                                n_bases += (exon[1] - exon[0] + 1)
                else:
                    continue
            else:
                continue


            remainder = n_bases % 3
            '''
            *   *   *   *   |*   *   *    *    *
                            |6   7   8    9    10

            remainder means a fragment including the first base in the first exon, how many left. 1 left means 6 should be the first base of new codon
            2 left means 6 should be 6 is the second in last codon, 7 will be the last base in the last codon, have to start with 8
            0 left means 6 should be the last base in the last codon, so that we start with 7
            '''
            if remainder == 1:
                phase = 0
            elif remainder == 2:
                phase = 2
            elif remainder == 0:
                phase = 1
            supports.append((phase, pssc, enst,strand))

    elif strand == '-':
        for enst, tran in all_trans.items():
            tran.sort(
                key=lambda x: x[0])  # because negative strand looks like [(5,7),(2,4)], we change it to [(2,4),(5,7)]
            lis = []
            [lis.extend(list(exon)) for exon in tran]
            pssc_insert_pos = bisect_left(lis, pssc)
            coord_first_exon_first_base = coord_first_exon_last_base + length_first - 1
            junction_insert_pos = bisect_left(lis, coord_first_exon_first_base)

            if junction_insert_pos % 2 == 1 and pssc_insert_pos % 2 == 1:
                if junction_insert_pos == pssc_insert_pos:
                    n_bases = pssc - coord_first_exon_first_base + 1
                elif junction_insert_pos < pssc_insert_pos:
                    start_exon_index = (pssc_insert_pos - 1) // 2
                    end_exon_index = (junction_insert_pos - 1) // 2
                    n_bases = 0
                    for i, exon in enumerate(tran):
                        if i == start_exon_index:
                            n_bases += (pssc - exon[0] + 1)
                        elif i == end_exon_index:
                            n_bases += (exon[1] - coord_first_exon_first_base + 1)
                        else:
                            if i > end_exon_index and i < start_exon_index:
                                n_bases += (exon[1] - exon[0] + 1)
                else:
                    continue
            else:
                continue



            remainder = n_bases % 3  # same idea as above forward strand, see explanation above
            if remainder == 1:
                phase = 0
            elif remainder == 2:
                phase = 2
            elif remainder == 0:
                phase = 1
            supports.append((phase, pssc, enst, strand))

    return supports   



    
def is_in_db(valid):
    # provide a list with uid without repeats
    mapping = {}
    for uid in tqdm(valid):
        ensg = uid.split(':')[0]
        exons = ':'.join(uid.split(':')[1:])
        if '_' in exons or 'U' in exons or 'ENSG' in exons or 'I' in exons:
            mapping[uid] = False
        else:
            exonlist = dict_exonlist[ensg]
            exonstring = '|'.join(exonlist)
            e1,e2 = exons.split('-')
            pattern1 = re.compile(r'^{}\|{}\|'.format(e1,e2))  # ^E1.1|E2.3|
            pattern2 = re.compile(r'\|{}\|{}$'.format(e1,e2))  # |E1.1|E2.3$
            pattern3 = re.compile(r'\|{}\|{}\|'.format(e1,e2)) # |E1.1|E2.3|
            if re.search(pattern3,exonstring) or re.search(pattern2,exonstring) or re.search(pattern1,exonstring):   # as long as match one pattern, should be eliminated
                mapping[uid] = True
            else:
                mapping[uid] = False
    return mapping








[docs]class JunctionCountMatrixQuery(): ''' Instantiate the JunctionCountMatrixQuery class :param junction_count_matrix: The pandas dataframe for the junction_count_matrix, outputted by AltAnalyze :param cores: int, how many cores you'd like to use, if None, then will automatically detect the maximum amount of cores in the OS :param add_control: None or a dictionary containing additional controls, additional controls can a dataframe or anndata, for instance, if adding two controls asides from internal GTEx, {'tcga_matched_control':df,'gtex_skin':adata}, when added database contains multiple tissue types, it is recommended to pass as a AnnData with the tissue information stored as adata.var['tissue'], otherwise, if passed as a dataframe, or samples will be recognized as a single tissue type, it will affect tumor specifcity score calculation if using MLE or bayesian hierarchical models. :param not_in_db: boolean, whether to remove junctions that are present in any Ensembl documented transcript, remember some of the documented transcript in Ensembl are tumor-specific as well, doing so may remove some bona fide hits. But good for reducing number of neoantigens for validation. :param outdir: string, the output folder for storing results :param filter_mode: string, either 'maxmin' or 'prevalance', this correspond to the parameters that will be used in snaf.initialize, if maxmin, use t_min and n_max, if prevalance, use normal and tumor cutoff Example:: jcmq = JunctionCountMatrixQuery(junction_count_matrix=df,cores=50,add_control=control_df,not_in_db=True) ''' def __init__(self,junction_count_matrix,cores=None,add_control=None,not_in_db=False,outdir='.',filter_mode='maxmin'): self.junction_count_matrix = junction_count_matrix if cores is None: cores = mp.cpu_count() self.cores = cores if not_in_db: self.get_neojunctions(add_control=add_control,dict_exonlist=dict_exonlist,outdir=outdir,filter_mode=filter_mode) else: self.get_neojunctions(add_control=add_control,dict_exonlist=None,outdir=outdir,filter_mode=filter_mode) def __str__(self): try: len_translated = len(self.translated) except AttributeError: len_translated = None try: shape_cond_subset_df = self.cond_subset_df.shape except AttributeError: shape_cond_subset_df = None try: len_results = len(self.results) except AttributeError: len_results = None return 'junction_count_matrix: {}\n'\ 'cores: {}\n'\ 'valid: {}\n'\ 'invalid: {}\n'\ 'cond_df: {}\n'\ 'subset: {}\n'\ 'translated: list of {} nj objects\n'\ 'cond_subset_df: {}\n'\ 'results: list of length {}'.format(self.junction_count_matrix.shape,self.cores,len(self.valid),len(self.invalid), self.cond_df.shape,self.subset.shape,len_translated,shape_cond_subset_df,len_results) def get_neojunctions(self,add_control,dict_exonlist,outdir,filter_mode): self.valid, self.invalid, self.cond_df = multiple_crude_sifting(self.junction_count_matrix,add_control,dict_exonlist,outdir,filter_mode) self.subset = self.junction_count_matrix.loc[self.valid,:] self.cond_subset_df = self.cond_df.loc[self.valid,:] def get_neojunction_info(self,event): # more for B cell antigen, since we design it in a way that B cell pipeline completely rely on T pipeline for GTEx check ed = self.subset.loc[event,:].to_dict() freq = np.count_nonzero(np.array(list(ed.values())))/len(ed) return ed,freq
[docs] @staticmethod def get_membrane_tuples(df,**kwargs): ''' this function is used by SurfaceAntigen pipeline to filter out splicing evnets that are not tumor-specific and also compute useful informations for each membrane protein. :param df: pandas dataframe, the junction count matrix :param **kwargs: will be passed to __init__ of JunctionCountMatrixQuery class function :return membrane_tuples: a list, in which each item is a tuple (uid,mean_gtex,df_gtex,ed,freq). * uid: the uid of the splicing evnet * mean_gtex: the mean raw read count across GTEx * df_gtex: a dataframe of gtex sample name, tissue types and read count values * ed: expression dictionary for tumor samples * freq: expression frequency for tumor samples Example:: membrane_tuples = snaf.JunctionCountMatrixQuery.get_membrane_tuples(df) ''' from .surface import filter_to_membrane_protein jcmq = JunctionCountMatrixQuery(junction_count_matrix=df,**kwargs) print(jcmq) neojunctions = jcmq.valid membrane_uid = filter_to_membrane_protein(neojunctions) membrane_tuples = [] for uid in membrane_uid: mean_gtex, df_gtex = tumor_specificity(uid,method='mean',return_df=True) ed, freq = jcmq.get_neojunction_info(uid) membrane_tuples.append((uid,mean_gtex,df_gtex,ed,freq)) return membrane_tuples
@staticmethod def split_df_to_chunks(df,cores=None): df_index = np.arange(df.shape[0]) if cores is None: cores = mp.cpu_count() sub_indices = np.array_split(df_index,cores) sub_dfs = [df.iloc[sub_index,:] for sub_index in sub_indices] return sub_dfs @staticmethod def split_array_to_chunks(array,cores=None): if not isinstance(array,list): raise Exception('split_array_to_chunks function works for list, not ndarray') array_index = np.arange(len(array)) if cores is None: cores = mp.cpu_count() sub_indices = np.array_split(array_index,cores) sub_arrays = [] for sub_index in sub_indices: item_in_group = [] for i in sub_index: item_in_group.append(array[i]) sub_arrays.append(item_in_group) return sub_arrays @staticmethod def each_chunk_func(input_,kind,hlas=None,strict=False,sub_cond_df=None,binding_method=None): if kind == 1: nj_list = [] for uid in input_.index: nj = NeoJunction(uid=uid,count=50,check_gtex=False) nj.detect_type() nj.retrieve_junction_seq() nj.in_silico_translation(strict=strict) nj_list.append(nj) elif kind == 2: # out of development nj_list = [] for uid in input_.index: nj = NeoJunction(uid=uid,count=50,check_gtex=False) nj.detect_type() nj.retrieve_junction_seq() nj.in_silico_translation() nj.binding_prediction() nj.immunogenicity_prediction() nj_list.append(nj) elif kind == 3: # currently recommended nj_list = [] assert len(input_) == sub_cond_df.shape[0] for nj,index in tqdm(zip(input_,range(sub_cond_df.shape[0])),total=len(input_)): cond_row = sub_cond_df.iloc[index].tolist() hlas_row = list(compress(hlas,cond_row)) combined_unique_hlas = reduce(lambda a,b:list(set(a+b)),hlas_row) try: nj.binding_prediction(hlas=combined_unique_hlas,binding_method=binding_method) except Exception as e: if str(e) == 'Already no candidates after in-silico translation': nj_list.append(None) continue else: raise Exception('binding prediction error: {}'.format(e)) try: nj.immunogenicity_prediction() except Exception as e: if str(e) == 'Already no candidates after binding prediction': nj_list.append(None) continue else: raise Exception('immunogenic prediction error: {}'.format(e)) nj_list.append(nj) elif kind == 4: nj_list = [] for nj in input_: if nj is None: nj_list.append(None) continue try: nj.binding_prediction(hlas=hlas) except Exception as e: nj_list.append(None) continue try: nj.immunogenicity_prediction() except Exception as e: nj_list.append(None) continue nj_list.append(nj) return nj_list
[docs] def run(self,hlas,strict=False,outdir='.',name='after_prediction.p'): ''' main function to run mhc bound peptide T antigen pipeline :param hlas: list, each item is a list of 6 HLA types associated with each sample, the order of this list must be consistent with the sample name in the df column :param strict: boolean, default is False, determine whether to filter out peptide which doesn't have canonical start codon support :param outdir: string, the path where all the results will go into :param name: string, the name of the generated pickle result object Example:: jcmq.run(hlas=hlas,outdir='result',name='after_prediction.p') ''' self.parallelize_run(kind=1,strict=strict) print(self) self.parallelize_run(kind=3,hlas=hlas) self.serialize(outdir=outdir,name=name)
[docs] @staticmethod def generate_results(path,outdir,criterion=None): ''' wrapper function to automatically generate all necessary output :param path: string, where the pickle result file lie on the disk :param outdir: string, where to generate all the necessary result Example:: JunctionCountMatrixQuery.generate_results(path='result/after_prediction.p',outdir='result') ''' jcmq = JunctionCountMatrixQuery.deserialize(name=path) stage0_compatible_results(jcmq,outdir=outdir) for stage in [3,2,1]: jcmq.show_neoantigen_burden(outdir=outdir,name='burden_stage{}.txt'.format(stage),stage=stage,verbosity=1,contain_uid=False,criterion=criterion) jcmq.show_neoantigen_frequency(outdir=outdir,name='frequency_stage{}.txt'.format(stage),stage=stage,verbosity=1,contain_uid=False,plot=True,plot_name='frequency_stage{}.pdf'.format(stage),criterion=criterion) jcmq.show_neoantigen_frequency(outdir=outdir,name='frequency_stage{}_verbosity1_uid.txt'.format(stage),stage=stage,verbosity=1,contain_uid=True,plot=False,criterion=criterion) # add additional attributes df = pd.read_csv(os.path.join(outdir,'frequency_stage{}_verbosity1_uid.txt'.format(stage)),sep='\t',index_col=0) enhance_frequency_table(df,True,True,outdir,'frequency_stage{}_verbosity1_uid_gene_symbol_coord_mean_mle.txt'.format(stage)) # report candidates if stage == 3: dff = pd.read_csv(os.path.join(outdir,'frequency_stage{}_verbosity1_uid_gene_symbol_coord_mean_mle.txt'.format(stage)),sep='\t',index_col=0) for sample in tqdm(jcmq.junction_count_matrix.columns,total=jcmq.junction_count_matrix.shape[1]): report_candidates(jcmq,dff,sample,os.path.join(outdir,'T_candidates'),True) print('concatenating all T antigen files into one & indicate whether in AltAnalyze database') df_list = [] for sample in jcmq.junction_count_matrix.columns: df_list.append(pd.read_csv(os.path.join(outdir,'T_candidates','T_antigen_candidates_{}.txt'.format(sample)),sep='\t',index_col=0)) final_df = pd.concat(df_list,axis=0) all_uid = final_df['uid'].unique().tolist() mapping = is_in_db(all_uid) final_df['in_db'] = final_df['uid'].map(mapping).values final_df.to_csv(os.path.join(outdir,'T_candidates','T_antigen_candidates_all.txt'),sep='\t') # add additional attributes to stage0 df = pd.read_csv(os.path.join(outdir,'frequency_stage0.txt'),sep='\t',index_col=0) df.index = [','.join([item,item]) for item in df.index] df.to_csv(os.path.join(outdir,'frequency_stage0_verbosity1_uid.txt'),sep='\t') df = pd.read_csv(os.path.join(outdir,'frequency_stage0_verbosity1_uid.txt'),sep='\t',index_col=0) enhance_frequency_table(df,True,True,outdir,'frequency_stage0_verbosity1_uid_gene_symbol_coord_mean_mle.txt')
def parallelize_run(self,kind,hlas=None,strict=False): pool = mp.Pool(processes=self.cores) if kind == 1 or kind == 2: sub_dfs = JunctionCountMatrixQuery.split_df_to_chunks(self.subset,self.cores) r = [pool.apply_async(func=JunctionCountMatrixQuery.each_chunk_func,args=(sub_df,kind,None,strict,)) for sub_df in sub_dfs] pool.close() pool.join() results = [] for collect in r: result = collect.get() results.extend(result) self.translated = results elif kind == 3: sub_arrays = JunctionCountMatrixQuery.split_array_to_chunks(self.translated,self.cores) sub_cond_dfs = JunctionCountMatrixQuery.split_df_to_chunks(self.cond_subset_df,self.cores) r = [pool.apply_async(func=JunctionCountMatrixQuery.each_chunk_func,args=(sub_array,kind,hlas,False,sub_cond_df,binding_method,)) for sub_array,sub_cond_df in zip(sub_arrays,sub_cond_dfs)] pool.close() pool.join() results = [] for collect in r: result = collect.get() results.extend(result) self.results = (results,hlas) elif kind == 4: # heterogeneous hlas for each sample self.cond_subset_df = self.cond_df.loc[self.valid,:] need_to_predict_across_samples = [] # [[nj,None,nj,nj,None...],] for i,(label,content) in enumerate(self.cond_subset_df.iteritems()): need_to_predict = [] number_need = 0 for item in zip(self.translated,content.values): if item[1]: need_to_predict.append(item[0]) number_need += 1 else: need_to_predict.append(None) need_to_predict_across_samples.append(need_to_predict) print('{} has {} junctions to proceed predictions, HLA are {}'.format(label,number_need,hlas[i])) print('Binding and immunogenicity prediction starts') r = [pool.apply_async(func=JunctionCountMatrixQuery.each_chunk_func,args=(need_to_predict,kind,hlas,)) for need_to_predict,hlas in zip(need_to_predict_across_samples,hlas)] pool.close() pool.join() results = [] for collect in r: result = collect.get() results.append(result) self.results = results def serialize(self,outdir,name): if not os.path.exists(outdir): os.mkdir(outdir) with open(os.path.join(outdir,name),'wb') as f: pickle.dump(self,f,protocol=pickle.HIGHEST_PROTOCOL) @staticmethod def deserialize(name): with open(name,'rb') as f: jcmq = pickle.load(f) return jcmq
[docs] def visualize(self,uid,sample,outdir,tumor=False,criterion=[('netMHCpan_el',0,'<=',2),('deepimmuno_immunogenicity',1,'==','True'),]): ''' Visualize certain Neojunction in certain sample :param uid: string, the uid for the event you want to check :param sample: string, the sample name :param outdir: string, where to deposite the figure :param tumor: bool, whether to show the expression level in tumor sample as well, default is not :param criterion: a nested tuple, if none of the neoantigens are bound and immunogenic, no candidates will be plotted, so you can relax the criterion a bit based on the format specified. Example:: jcmq.visualize(uid='ENSG00000167291:E38.6-E39.1',sample='TCGA-DA-A1I1-06A-12R-A18U-07.bed',outdir='./result') ''' if not os.path.exists(outdir): os.mkdir(outdir) if sample is not None: row_index = self.subset.index.tolist().index(uid) col_index = self.subset.columns.tolist().index(sample) results = self.results[0] hlas = self.results[1] nj = deepcopy(results[row_index]) nj.enhanced_peptides = nj.enhanced_peptides.filter_based_on_hla(selected_hla=hlas[col_index]) nj.visualize(outdir,'{}_{}.pdf'.format(uid.replace(':','_'),sample),criterion=criterion) if tumor: # in tumor sample fig,ax = plt.subplots() counts = self.junction_count_matrix.loc[uid,:] ax.plot(np.arange(len(counts)),counts.values,marker='o',markersize=3,color='k',markerfacecolor='r',markeredgecolor='r',linestyle='--') ax.set_xticks(np.arange(len(counts))) ax.set_xticklabels(counts.index.values,fontsize=1,rotation=90) ax.set_title('{}_tumor'.format(uid),fontsize=8) ax.set_ylim(bottom=-0.05) ax.set_ylabel('counts') plt.savefig(os.path.join(outdir,'{}_tumor.pdf'.format(uid.replace(':','_')))) plt.close()
def show_neoantigen_burden(self,outdir,name,stage,verbosity,contain_uid,criterion=None): if not os.path.exists(outdir): os.mkdir(outdir) sub_arrays = JunctionCountMatrixQuery.split_array_to_chunks(self.results[0],self.cores) sub_conds = JunctionCountMatrixQuery.split_df_to_chunks(self.cond_subset_df,self.cores) hlas = self.results[1] pool = mp.Pool(processes=self.cores) r = [pool.apply_async(func=JunctionCountMatrixQuery.show_neoantigen_burden_single_run,args=(sub_array,sub_cond,hlas,stage,verbosity,contain_uid,criterion,)) for sub_array,sub_cond in zip(sub_arrays,sub_conds)] pool.close() pool.join() results = [] for collect in r: result = collect.get() results.append(result) burden_valid = pd.concat(results,axis=0) burden_valid.index = self.subset.index burden_valid.columns = self.subset.columns burden_invalid = self.junction_count_matrix.loc[self.invalid,:].copy() for col in burden_invalid.columns: burden_invalid.loc[:,col] = np.full(burden_invalid.shape[0],0) burden = pd.concat([burden_valid,burden_invalid],axis=0) # statistics burden['mean'] = burden.mean(axis=1) burden.loc['burden'] = burden.sum(axis=0) burden.sort_values(by='mean',axis=0,ascending=False,inplace=True) burden.sort_values(by='burden',axis=1,ascending=True,inplace=True) # reorder a bit c_list = burden.columns.tolist() c_list.remove('mean') c_list.append('mean') i_list = burden.index.tolist() i_list.remove('burden') i_list.append('burden') burden = burden.loc[i_list,c_list] burden.to_csv(os.path.join(outdir,name),sep='\t') def show_neoantigen_frequency(self,outdir,name,stage,verbosity,contain_uid,plot,plot_name=None,yscale='linear',criterion=None): if not os.path.exists(outdir): os.mkdir(outdir) sub_arrays = JunctionCountMatrixQuery.split_array_to_chunks(self.results[0],self.cores) sub_conds = JunctionCountMatrixQuery.split_df_to_chunks(self.cond_subset_df,self.cores) hlas = self.results[1] column_names = self.subset.columns pool = mp.Pool(processes=self.cores) r = [pool.apply_async(func=JunctionCountMatrixQuery.show_neoantigen_frequency_single_run,args=(sub_array,sub_cond,hlas,column_names,stage,verbosity,contain_uid,criterion,)) for sub_array,sub_cond in zip(sub_arrays,sub_conds)] pool.close() pool.join() results = [] for collect in r: result = collect.get() results.append(result) # let's define another function for merging dictionary, but not overwrite the values def merge_key_and_values(dic1,dic2): return {x:list(set(dic1.get(x,[]) + dic2.get(x,[]))) for x in set(dic1).union(set(dic2))} dic = reduce(merge_key_and_values,results) # statistics df = pd.Series(data=dic).to_frame(name='samples') df['n_sample'] = df.apply(lambda x:len(set(x[0])),axis=1).values df.sort_values(by='n_sample',ascending=False,inplace=True) df.to_csv(os.path.join(outdir,name),sep='\t') # plot if plot: fig,ax = plt.subplots() ax.bar(x=np.arange(df.shape[0]),height=df['n_sample'].values,edgecolor='k') ax.set_xlabel('Neoantigen rank by its occurence (descending order)') ax.set_ylabel('Occurence (n_sample)') ax.set_title('Neoantigen Occurence') plt.savefig(os.path.join(outdir,'x_neoantigen_{}'.format(plot_name)),bbox_inches='tight') plt.close() fig,ax = plt.subplots() try: sns.histplot(df['n_sample'].values,binwidth=1,kde=True,ax=ax) ax.set_yscale(yscale) plt.savefig(os.path.join(outdir,'x_occurence_{}'.format(plot_name)),bbox_inches='tight') plt.close() except ValueError: print('All neoantigens are present in same amount of patients, seaborn histplot can not properly figure out binedge, no occurence plot generated')
[docs] def show_neoantigen_as_fasta(self,outdir,name,stage,verbosity,contain_uid,sample=None,criterion=None): ''' write the neoantigen as a fasta file for MS validation or other purpose :param outdir: string ,the output directory for the generated fasta file :param name: string, the name of the output fasta file :param stage: int, either 0,1,2,3, the stage of neoantigen you want to report :param verbosity: int, either 1,2,3, the verbosity of candidates :param contain_uid: boolean, whether you want to contain the junction UID along with the neoantigen :param sample: string, the name of the sample in which you want to extract the neoantigen :param criterion: nested list, the criterion for filtering out the candidate Examples:: jcmq = snaf.JunctionCountMatrixQuery.deserialize('result/after_prediction.p') sample = 'SRR5933735.Aligned.sortedByCoord.out' jcmq.show_neoantigen_as_fasta(outdir='./fasta',name='neoantigen_{}.fasta'.format(sample),stage=2,verbosity=1,contain_uid=True,sample=sample) ''' if not os.path.exists(outdir): os.mkdir(outdir) sub_arrays = JunctionCountMatrixQuery.split_array_to_chunks(self.results[0],self.cores) sub_conds = JunctionCountMatrixQuery.split_df_to_chunks(self.cond_subset_df,self.cores) hlas = self.results[1] column_names = self.subset.columns pool = mp.Pool(processes=self.cores) r = [pool.apply_async(func=JunctionCountMatrixQuery.show_neoantigen_frequency_single_run,args=(sub_array,sub_cond,hlas,column_names,stage,verbosity,contain_uid,criterion,)) for sub_array,sub_cond in zip(sub_arrays,sub_conds)] pool.close() pool.join() results = [] for collect in r: result = collect.get() results.append(result) # let's define another function for merging dictionary, but not overwrite the values def merge_key_and_values(dic1,dic2): return {x:list(set(dic1.get(x,[]) + dic2.get(x,[]))) for x in set(dic1).union(set(dic2))} dic = reduce(merge_key_and_values,results) # statistics df = pd.Series(data=dic).to_frame(name='samples') df['n_sample'] = df.apply(lambda x:len(set(x[0])),axis=1).values df.sort_values(by='n_sample',ascending=False,inplace=True) # subset if sample is not None: df = df.loc[df.apply(lambda x:sample in x['samples'], axis=1),:] name = 'neoantigen_' + sample + '.fasta' # write to fasta with open(os.path.join(outdir,name),'w') as f: for row in df.itertuples(): if not contain_uid: # as the row.Index is a tuple tmp = [str(item) for item in row.Index] f.write('>{}|{}\n'.format('|'.join(tmp),row.n_sample)) f.write('{}\n'.format(row.Index[0])) else: # as the row.Index is a string delimited by comma f.write('>{}|{}\n'.format(row.Index.replace(',','|'),row.n_sample)) f.write('{}\n'.format(row.Index.split(',')[0]))
# let's define an atomic function inside here @staticmethod def show_neoantigen_burden_single_run(sub_array,sub_cond,hlas,stage,verbosity,contain_uid,criterion=None): nj_burden_2d = [] for nj,index in zip(sub_array,range(sub_cond.shape[0])): nj_burden = [] cond_row = sub_cond.iloc[index].values for hla,cond in zip(hlas,cond_row): if nj is None or not cond : nj_burden.append(0) else: nj_copy = deepcopy(nj) nj_copy.enhanced_peptides = nj_copy.enhanced_peptides.filter_based_on_hla(selected_hla=hla) nj_copy.derive_candidates(stage=stage,verbosity=verbosity,contain_uid=contain_uid,criterion=criterion) nj_burden.append(len(nj_copy.candidates)) nj_burden_2d.append(nj_burden) df = pd.DataFrame(data=nj_burden_2d) return df # let's define the single function to run in each subprocess @staticmethod def show_neoantigen_frequency_single_run(sub_array,sub_cond,hlas,column_names,stage,verbosity,contain_uid,criterion=None): dic = {} for nj,index in zip(sub_array,range(sub_cond.shape[0])): cond_row = sub_cond.iloc[index].values for hla,column_name,cond in zip(hlas,column_names,cond_row): if nj is not None and cond: nj_copy = deepcopy(nj) nj_copy.enhanced_peptides = nj_copy.enhanced_peptides.filter_based_on_hla(selected_hla=hla) nj_copy.derive_candidates(stage=stage,verbosity=verbosity,contain_uid=contain_uid,criterion=criterion) for cand in nj_copy.candidates: try: dic[cand].append(column_name) except KeyError: dic[cand] = [] dic[cand].append(column_name) return dic
class EnhancedPeptides(): def __init__(self,peptides,hlas,kind): # kind means how to instantiate the EnhancedPeptides object if kind == 2: # hla-reduced enhanced peptide, this mode is used when filtsre_based_on_hla self.mers = peptides self.info = hlas else: self.mers = [] self.info = [] for k,v in peptides.items(): self.mers.append(k) phlas = {} # {pep1:{origin:(extra,n_from_first,phase,evidences),hla1:{},hla2:{}}} for pep in v: if kind == 0: # each pep is (pep,extra,n_from_first,phase,evidences), this mode is used before binding_prediction pairs = {} pairs['origin'] = (pep[1],pep[2],pep[3],pep[4]) for hla in hlas: pairs[hla] = {} phlas[pep[0]] = pairs elif kind == 1: # each pep is (pep,extra,n_from_first,hla,phase,evidences), this mode is used when filter_based_on_criterion try: phlas[pep[0]]['origin'] = (pep[1],pep[2],pep[4],pep[5]) except KeyError: phlas[pep[0]] = {} phlas[pep[0]]['origin'] = (pep[1],pep[2],pep[4],pep[5]) phlas[pep[0]][pep[3]] = {} self.info.append(phlas) def __str__(self): return str(self.info) def __getitem__(self,key): index = self.mers.index(key) return self.info[index] def is_empty(self): total = 0 for dic in self.info: total += len(dic) if total == 0: return True else: return False def simplify_to_list(self,verbosity): # make sure to first filter and re-instantiate if verbosity==1: # only peptide, result is also tuple result_list = [] for mer in self.mers: result_list.extend(list(self[mer].keys())) # utlize __getitem__ method, so self[9] instead of self[0] result_list = [(item,) for item in result_list] elif verbosity==2: # peptide and hla result_list = [] for mer in self.mers: for pep,hla_complex in self[mer].items(): for hla in hla_complex.keys(): if hla != 'origin': result_list.append((pep,hla)) elif verbosity==3: result_list = [] for mer in self.mers: for pep,hla_complex in self[mer].items(): extra,n_from_first,phase,evidences = hla_complex['origin'] for hla in hla_complex.keys(): if hla != 'origin': result_list.append((pep,hla,extra,n_from_first)) return result_list def register_attr(self,df,attr_name): ''' df should follow: peptide mer hla score identity AAAAAAAAA 9 HLA-A*01:01 0.3 SB ''' for mer,sub_df in df.groupby(by='mer'): index = self.mers.index(int(mer)) for peptide,sub_sub_df in sub_df.groupby(by='peptide'): for row in sub_sub_df.itertuples(index=False): self.info[index][peptide][row.hla][attr_name] = (float(row.score),str(row.identity)) def filter_based_on_hla(self,selected_hla,reinstantiate=True): new_mers = [] new_info = [] selected_hla = set(selected_hla) for i,k in enumerate(self.mers): new_info_container = {} for pep,hla_complex in self.info[i].items(): reduced_hla_complex = {} for hla,attrs in hla_complex.items(): if hla == 'origin': reduced_hla_complex['origin'] = attrs else: if hla in selected_hla: reduced_hla_complex[hla] = attrs new_info_container[pep] = reduced_hla_complex new_mers.append(k) new_info.append(new_info_container) if reinstantiate: ep = EnhancedPeptides(peptides=new_mers,hlas=new_info,kind=2) return ep def filter_based_on_criterion(self,criteria,reinstantiate=True): # criterion: [(net),], ['netMHCpan_el',1,==,SB] peptides = {k:[] for k in self.mers} for i,k in enumerate(self.mers): for pep,hla_complex in self.info[i].items(): extra,n_from_first,phase,evidences = hla_complex['origin'] for hla,attrs in hla_complex.items(): if hla == 'origin': continue boolean_list = [] for criterion in criteria: if criterion[1] == 1: # matched is not a number eval_string = 'attrs[\'{}\'][{}] {} \'{}\''.format(criterion[0],criterion[1],criterion[2],criterion[3]) elif criterion[1] == 0: # matched is a number eval_string = 'attrs[\'{}\'][{}] {} {}'.format(criterion[0],criterion[1],criterion[2],criterion[3]) try: boolean = eval(eval_string) except KeyError: boolean = False boolean_list.append(boolean) boolean_final = all(boolean_list) if boolean_final: peptides[k].append((pep,extra,n_from_first,hla,phase,evidences)) # if not reinstantiate, this format is called reduced form if reinstantiate: return EnhancedPeptides(peptides,None,1) else: return peptides class NeoJunction(): def __init__(self,uid,count,check_gtex): if check_gtex: NeoJunction.is_neojunction(uid,count) self.uid = uid self.count = count def __str__(self): try: ts = self.tumor_specificity except AttributeError: ts = None try: et = self.event_type except AttributeError: et = None try: j = self.junction[:5] + '...' + self.junction[-5:] except AttributeError: j = None try: peptides = {k:len(v) for k,v in self.peptides.items()} except AttributeError: peptides = None try: ep = {i:len(self.enhanced_peptides[i]) for i in self.enhanced_peptides.mers} except AttributeError: ep = None try: cand_len = len(self.candidates) cand_example = self.candidates[0] except AttributeError: cand_len,cand_example = None,None return 'uid: {}\n'\ 'count: {}\n'\ 'tumor specificity: {}\n'\ 'event type: {}\n'\ 'junction: {}\n'\ 'peptides: {}\n'\ 'Enhanced peptides: {}\n'\ 'Candidates: len is {}, first is {}\n'.format(self.uid,self.count,ts,et,j,peptides,ep,cand_len,cand_example) @staticmethod def is_neojunction(uid,count): identity,detail = crude_tumor_specificity(uid,count) if not identity: raise Exception('This is not a NeoJunction, instantiation fails') def infer_tumor_specificity(self,method): tumor_specificity = accurate_tumor_specificity(self.uid,method) self.tumor_specificity = tumor_specificity return tumor_specificity def gtex_viewer(self,kind,outdir): if kind == 1: # line plot, all tissues, count (norm or not) gtex_visual_count(self.uid,norm=True,out_folder=outdir) elif kind == 2: # hist plot, combined, norm count gtex_visual_norm_count_combined(self.uid,out_folder=outdir) elif kind == 3: # hist plot, per tissue, number of samples that are non-zero in each tissue gtex_visual_per_tissue_count(self.uid,out_folder=outdir) def detect_type(self): ''' Ordinary: ENSG00000107902:E10.1-E12.1 Alt3: ENSG00000110057:E5.1-E6.2_67996641 Alt5: ENSG00000100321:E7.1_39364266-E8.1 Intron Retention: ENSG00000115524:I4.1-E5.1 Novel Exon: ENSG00000008441:I40.1_13076665-E41.1 Trans-splicing: ENSG00000196565:E14.2-ENSG00000213934:E3.1 UTR Event: ENSG00000164068:U0.1_49689185-E2.1 ''' valid_pattern = re.compile(r'^ENSG\d+:.+?-.+') if re.search(valid_pattern,self.uid): # at least valid one if len(re.findall('ENSG',self.uid)) == 2: event_type = 'trans_splicing' elif 'U' in self.uid: event_type = 'utr_event' elif '_' in self.uid: subexon12 = self.uid.split(':')[1] subexon1, subexon2 = subexon12.split('-') if 'I' in subexon12: event_type = 'novel_exon' elif '_' in subexon1 and '_' in subexon2: event_type = 'alt5_alt3' elif '_' in subexon1 and '_' not in subexon2: event_type = 'alt5' elif '_' in subexon2 and '_' not in subexon1: event_type = 'alt3' else: event_type = 'invalid' elif 'I' in self.uid: event_type = 'intron_retention' elif re.search(r'^ENSG\d+:E\d+\.\d+-E\d+\.\d+$',self.uid): e = self.uid.split(':')[1] e1 = e.split('-')[0] e2 = e.split('-')[1] e1_int = int(e1.split('.')[0][1:]) e1_frac = int(e1.split('.')[1]) e2_int = int(e2.split('.')[0][1:]) e2_frac = int(e2.split('.')[1]) if e1 == e2: # E5.1-E5.1 event_type = 'invalid' else: if e1_int > e2_int or (e1_int==e2_int and e1_frac>e2_frac): # E13.1-E12.4 or E12.10-E12.9 event_type = 'invalid' else: event_type = 'ordinary' else: event_type = 'invalid' else: event_type = 'invalid' self.event_type = event_type return event_type def retrieve_junction_seq(self): if self.event_type != 'invalid': ensid = self.uid.split(':')[0] subexon1,subexon2 = ':'.join(self.uid.split(':')[1:]).split('-') code = is_consecutive(subexon1,subexon2) seq1 = subexon_tran(subexon1,ensid,'site1',code) seq2 = subexon_tran(subexon2,ensid,'site2',code) junction = ','.join([seq1,seq2]) self.junction = junction else: self.junction = '$' * 10 # indicating invalid uid def in_silico_translation(self,ks=[9,10],strict=False): peptides = {k:[] for k in ks} coord = uid_to_coord(self.uid) # chr1:555-666(+) if '$' not in self.junction and '*' not in self.junction and '#' not in self.junction and 'unknown' not in coord: first,second = self.junction.split(',') # annotate peptide based on starting codon evidence ensg = self.uid.split(':')[0] strand = coord.split('(')[1].rstrip(')') if strand == '+': coord_first_exon_last_base = coord.split(':')[1].split('-')[0] elif strand == '-': coord_first_exon_last_base = coord.split(':')[1].split('(')[0].split('-')[1] possible_start_codon_coord = dict_start_codon.get(ensg,[]) # [333,444] or [] support_phases_dict = {} # {0:[(333,enst,+)]} or {} for pssc in possible_start_codon_coord: supports = get_support_phase(ensg,coord_first_exon_last_base,pssc,strand,len(first)) for (phase_, pssc_, enst_, strand_) in supports: support_phases_dict.setdefault(phase_,[]).append((pssc_,enst_,strand_)) for phase in [0,1,2]: # tranlation starts with index "phase" evidences = tuple(support_phases_dict.get(phase,[])) if strict and len(evidences)==0: # in strict mode, without start_codon evidence, let's skip this phase continue de_facto_first = first[phase:] pep_dict = get_peptides(de_facto_first,second,ks,phase,evidences) # (pep,extra,n_from_first,phase,evidences) for k in ks: peptides[k].extend(pep_dict[k]) peptides[k] = list(set(peptides[k])) self.peptides = peptides return peptides def binding_prediction(self,hlas,binding_method=None): hlas = list(set(hlas)) ep = EnhancedPeptides(self.peptides,hlas,0) if ep.is_empty(): raise Exception('Already no candidates after in-silico translation') for k,v in self.peptides.items(): try: v = list(zip(*v))[0] except IndexError: continue if binding_method == 'netMHCpan': df = run_netMHCpan(software_path,v,hla_formatting(hlas,'netMHCpan_output','netMHCpan_input'),k) elif binding_method == 'MHCflurry': df = run_MHCflurry(v,hla_formatting(hlas,'netMHCpan_output','deepimmuno')) df['hla'] = hla_formatting(df['hla'].tolist(),'deepimmuno','netMHCpan_output') ep.register_attr(df,'netMHCpan_el') self.enhanced_peptides = ep def immunogenicity_prediction(self): reduced = self.enhanced_peptides.filter_based_on_criterion([('netMHCpan_el',0,'<=',2),],False) ep = self.enhanced_peptides.filter_based_on_criterion([('netMHCpan_el',0,'<=',2),],True) if ep.is_empty(): raise Exception('Already no candidates after binding prediction') for k,v in reduced.items(): try: v_pep,v_hla = list(zip(*v))[0],list(zip(*v))[3] except IndexError: # one of the mer, no candicate, but the other mer has, so pass the empty check continue data = np.column_stack((v_pep,v_hla)) df_input = pd.DataFrame(data=data) df_input[1] = hla_formatting(df_input[1].tolist(),'netMHCpan_output','deepimmuno') df_output = run_deepimmuno(df_input) df = pd.DataFrame({'peptide':df_output['peptide'].values,'mer':[k]*df_output.shape[0], 'hla':hla_formatting(df_output['HLA'].values.tolist(),'deepimmuno','netMHCpan_output'), 'score':df_output['immunogenicity'].values, 'identity':[True if item > 0.5 else False for item in df_output['immunogenicity'].values]}) self.enhanced_peptides.register_attr(df,attr_name='deepimmuno_immunogenicity') def derive_candidates(self,stage,verbosity,contain_uid=True,criterion=None): if stage == 1: # all translated peptides self.candidates = self.enhanced_peptides.simplify_to_list(verbosity=verbosity) elif stage == 2: # all bound peptides self.candidates = self.enhanced_peptides.filter_based_on_criterion([('netMHCpan_el',0,'<=',2),]).simplify_to_list(verbosity=verbosity) elif stage == 3: # immunogenic peptides self.candidates = self.enhanced_peptides.filter_based_on_criterion([('netMHCpan_el',0,'<=',2),('deepimmuno_immunogenicity',1,'==','True'),]).simplify_to_list(verbosity=verbosity) elif stage == 'custom': self.candidates = self.enhanced_peptides.filter_based_on_criterion(criterion).simplify_to_list(verbosity=verbosity) if contain_uid: new = [] for item in self.candidates: tmp = [str(i) for i in item] tmp.append(self.uid) new.append(','.join(tmp)) self.candidates = new def visualize(self,outdir,name,criterion=[('netMHCpan_el',0,'<=',2),('deepimmuno_immunogenicity',1,'==','True'),]): reduced = self.enhanced_peptides.filter_based_on_criterion(criterion,False) # now each element should add phase and evidences as well ''' {9: [('LPSPPAQEL', 2, 0, 'HLA-B*08:01'), ('LPSPPAQEL', 2, 0, 'HLA-B*08:02'), ('SLYLLLQHR', 1, 2, 'HLA-A*68:01')], 10: [('TSLYLLLQHR', 1, 3, 'HLA-A*68:01'), ('TLPSPPAQEL', 2, 1, 'HLA-A*02:01'), ('TLPSPPAQEL', 2, 1, 'HLA-A*24:02'), ('TLPSPPAQEL', 2, 1, 'HLA-B*08:01'), ('TLPSPPAQEL', 2, 1, 'HLA-B*08:02')]} ''' n_axes = 0 to_draw = [] for k,v in reduced.items(): to_draw.extend(v) n_axes += len(v) ncols = 4 nrows = n_axes // ncols + 1 + 1 height_ratios = [0.2 if i == 0 else (1-0.2)/(nrows-1) for i in range(nrows)] fig = plt.figure() gs = mpl.gridspec.GridSpec(nrows=nrows,ncols=ncols,height_ratios=height_ratios,wspace=0.5,hspace=0.5) ax_genome = fig.add_subplot(gs[0,:]) axes_list = [] for i in range(1,gs.nrows): for j in range(0,gs.ncols): ax = fig.add_subplot(gs[i,j]) axes_list.append(ax) ax_genome = draw_genome(ax_genome,self.uid,dict_exonCoords) for i,ax in enumerate(axes_list): if i < n_axes: # info aa, extra, n_from_first, hla, phase, evidences = to_draw[i] first,second = self.junction.split(',') dna_first = extra + n_from_first * 3 dna_second = -extra + (len(aa)-n_from_first) * 3 binding_score = self.enhanced_peptides[len(aa)][aa][hla]['netMHCpan_el'][0] immunogenicity_score = self.enhanced_peptides[len(aa)][aa][hla]['deepimmuno_immunogenicity'][0] # draw ax = show_candicates(ax,aa,extra,n_from_first,hla,phase,evidences,first,second,dna_first,dna_second,binding_score,immunogenicity_score) else: ax.axis('off') fig.subplots_adjust(top=0.9) fig.suptitle('{} Count:{}'.format(self.uid,self.count)) plt.savefig(os.path.join(outdir,name),bbox_inches='tight') plt.close() # processing functions def hla_formatting(pre,pre_type,post_type): if pre_type == 'netMHCpan_output' and post_type == 'netMHCpan_input': # HLA-A*01:01 to HLA-A01:01 post = [hla.replace('*','') for hla in pre] elif pre_type == 'netMHCpan_input' and post_type == 'netMHCpan_output': # HLA-A01:01 to HLA-A*01:01 post = [hla[:5] + '*' + hla[5:] for hla in pre] elif pre_type == 'netMHCpan_output' and post_type == 'deepimmuno': # HLA-A*01:01 to HLA-A*0101 post = [hla.replace(':','') for hla in pre] elif pre_type == 'deepimmuno' and post_type == 'netMHCpan_output': # HLA-A*0101 to HLA-A*01:01 post = [hla[:8] + ':' + hla[-2:] for hla in pre] return post def get_peptides(de_facto_first,second,ks,phase,evidences): peptides = {k:[] for k in ks} extra = len(de_facto_first) % 3 # how many base left in first assuming no stop condon in front of it. num = len(de_facto_first) // 3 # how many set of codons in the first. aa_first = str(Seq(de_facto_first).translate(to_stop=True)) # if not the multiple of 3, only translate the part that are divisible by 3, it will be a warning for BioPython if len(aa_first) == num: # successfully read through if extra == 0: continue_second = second elif extra == 1: continue_second = de_facto_first[-1] + second elif extra == 2: continue_second = de_facto_first[-2:] + second aa_second = str(Seq(continue_second).translate(to_stop=True)) if len(aa_second) > 0: # at least, not '' for k in ks: second_most = min(k,len(aa_second)) # the max allowed number of residue for aa_second first_most = len(aa_first) # the max allowed number of residue for aa_first for n_from_second in range(second_most,0,-1): n_from_first = k - n_from_second if n_from_first == 0 and extra == 0: ''' cttca cct cac ttt acc ttc tcc tcc agc aca gga act agg aac tac gga gag aga agc caa S P H F T F S S S T G T R N Y G E R S Q the comma is between "ttt" and "acc" so, when extra == 0, means whole second will be after the junction, in this case, we have to require the peptide at least include a amino acid from first part, otherwise, TFSSSTGTR won't be a splice-derived peptide. this example can be reproduced using: nj = NeoJunction('ENSG00000223572:E15.1-E15.2') ''' continue if n_from_first <= first_most: if n_from_first > 0: pep = aa_first[-n_from_first:] + aa_second[:n_from_second] elif n_from_first == 0: pep = aa_second[:n_from_second] peptides[k].append((pep,extra,n_from_first,phase,evidences)) return peptides def query_from_dict_fa(abs_start,abs_end,EnsID,strand): ''' abs_start and abs_end always means the xth base in forward strand the returned exon_seq, however, means the 5'-3' seq depending on the strand information. ''' if strand == '+': start = int(dict_fa[EnsID][1]) end = int(dict_fa[EnsID][2]) seq = dict_fa[EnsID][3] start_index = int(abs_start) - start + 2000 end_index = int(abs_end) - start + 1 + 2000 exon_seq = seq[start_index:end_index] elif strand == '-': start = int(dict_fa[EnsID][1]) end = int(dict_fa[EnsID][2]) seq_reverse = dict_fa[EnsID][3] seq_forward = str(Seq(seq_reverse).reverse_complement()) # Hs_gene.fa restore the reverse strand info start_index = int(abs_start) - start + 2000 end_index = int(abs_end) - start + 1 + 2000 # endpoint in python is non-inclusive exon_seq_1 = seq_forward[start_index:end_index] s = Seq(exon_seq_1) exon_seq = str(s.reverse_complement()) return exon_seq def utrAttrs(EnsID): # try to get U0.1's attribute, but dict_exonCoords doesn't have, so we just wanna get the first entry for its EnsGID exonDict = dict_exonCoords[EnsID] attrs = next(iter(exonDict.values())) chr_,strand = attrs[0],attrs[1] return chr_,strand def utrJunction(site,EnsGID,strand,chr_,flag,seq_len=100): # U0.1_438493849, here 438493849 means the site (suffix) if flag == 'site1' and strand == '+': # U0.1_438493849 - E2.2 otherSite = int(site) - seq_len + 1 # extract UTR with length = 100 exon_seq = retrieveSeqFromUCSCapi(chr_,int(otherSite),int(site)) elif flag == 'site1' and strand == '-': otherSite = int(site) + seq_len - 1 exon_seq = retrieveSeqFromUCSCapi(chr_,int(site),int(otherSite)) exon_seq = str(Seq(exon_seq).reverse_complement()) elif flag == 'site2' and strand == '+': # E5.3 - U5.4_48374838 otherSite = int(site) + seq_len -1 exon_seq = retrieveSeqFromUCSCapi(chr_,int(site),int(otherSite)) elif flag == 'site2' and strand == '-': otherSite = int(site) - seq_len + 1 exon_seq = retrieveSeqFromUCSCapi(chr_,int(otherSite),int(site)) exon_seq = str(Seq(exon_seq).reverse_complement()) return exon_seq def retrieveSeqFromUCSCapi(chr_,start,end): url = 'http://genome.ucsc.edu/cgi-bin/das/hg38/dna?segment={0}:{1},{2}'.format(chr_,start,end) response = requests.get(url) status_code = response.status_code assert status_code == 200 try: my_dict = xmltodict.parse(response.content) except: exon_seq = '#' * 10 # indicating the UCSC doesn't work return exon_seq exon_seq = my_dict['DASDNA']['SEQUENCE']['DNA']['#text'].replace('\n','').upper() return exon_seq def is_consecutive(subexon1,subexon2): # I only care of E3.3-E3.4, other form will automatically being labelled as not consecutive # the return code: 0 -> not consecutive, 1 -> consecutive code = 0 pattern1 = re.compile(r'^E(\d{1,3})\.(\d{1,3})$') match1 = re.search(pattern1,subexon1) if match1: captured_group1 = match1.group(1) captured_group2 = match1.group(2) pattern2 = re.compile(rf'^E{captured_group1}\.{int(captured_group2)+1}$') match2 = re.search(pattern2,subexon2) if match2: code = 1 return code def subexon_tran(subexon,EnsID,flag,code): # flag either site1 or site2 ''' 1. subexon can take multiple forms depending on the event type E1.2 or I3.4 E6.2_67996641 or I40.1_13076665, also depending on whether they are subexon1 or subexon2 ENSG00000213934:E3.1 or ENSG00000213934:E2.1_473843893894 U0.1_49689185 2. everything with trailing suffix will depend on the subexon1 or subexon2, but sometimes, it is fixed (trans-splicing can only be in subexon2) 3. to be clear, the exon_seq returned is always 5'-3' sequence, not forward anymore. if code is 1, meaning its consecutive, E3.3-E3.4, if the flag is site2, then offset the start position of it by 1, the start position is the conceptual start position, if it is in negative string, it will be operated by the stop position. ''' # let's remove those in the first place # if EnsID not in set(dict_exonCoords.keys()): # exon_seq = '*' * 10 # indicator for error on MultiPath-PSI itself # return exon_seq try: # E1.2 or I3.4 attrs = dict_exonCoords[EnsID][subexon] # [chr,strand,start,end,suffer] if code == 1 and flag == 'site2': if attrs[1] == '+': exon_seq = query_from_dict_fa(int(attrs[2])+1,attrs[3],EnsID,attrs[1]) else: exon_seq = query_from_dict_fa(attrs[2],int(attrs[3])-1,EnsID,attrs[1]) else: exon_seq = query_from_dict_fa(attrs[2],attrs[3],EnsID,attrs[1]) except KeyError: if ':' in subexon: # ENSG00000213934:E3.1 fusionGeneEnsID = subexon.split(':')[0] fusionGeneExon = subexon.split(':')[1] if '_' in fusionGeneExon: # ENSG:E2.1_473843893894 suffix = fusionGeneExon.split('_')[1] subexon = fusionGeneExon.split('_')[0] attrs = dict_exonCoords[fusionGeneEnsID][subexon] if attrs[1] == '+': exon_seq = query_from_dict_fa(suffix,attrs[3],fusionGeneEnsID,attrs[1]) else: exon_seq = query_from_dict_fa(attrs[2],suffix,fusionGeneEnsID,attrs[1]) else: # ENSG:E2.1 try: attrs = dict_exonCoords[fusionGeneEnsID][fusionGeneExon] except KeyError: exon_seq = '*' * 10 # indicator for error on MultiPath-PSI itself else: exon_seq = query_from_dict_fa(attrs[2],attrs[3],fusionGeneEnsID,attrs[1]) else: # could be trailing or utr, or non-existing ordinary subexon try: suffix = subexon.split('_')[1] except IndexError: # the logic is there's a subexon E45.3, it is no trailing, but just not in the exonCoords. exon_seq = '*' * 10 # indicator for error on MultiPath-PSI itself else: subexon = subexon.split('_')[0] try: attrs = dict_exonCoords[EnsID][subexon] except KeyError: # must be UTR chrUTR,strandUTR = utrAttrs(EnsID) # this is get from a random subexon under that EnsID exon_seq = utrJunction(suffix,EnsID,strandUTR,chrUTR,flag) else: # must be trailing if flag == 'site2': if attrs[1] == '+': exon_seq = query_from_dict_fa(suffix,attrs[3],EnsID,attrs[1]) else: exon_seq = query_from_dict_fa(attrs[2],suffix,EnsID,attrs[1]) elif flag == 'site1': # not affected by overhang since it is site1 if attrs[1] == '+': exon_seq = query_from_dict_fa(attrs[2],suffix,EnsID,attrs[1]) else: exon_seq = query_from_dict_fa(suffix,attrs[3],EnsID,attrs[1]) return exon_seq
[docs]def enhance_frequency_table(df,remove_quote=True,save=True,outdir='',name=None): ''' This is a wrapper function to add (1) gene symbol, (2) chromosome coordinate, (3) tumor specificity to the frequency table at once :param df: the dataframe for the frequency table (freq_stage{n}_verbosity1_uid.txt) :param remove_quote: boolean, depending on how your df is loaded, if directly read from the disk, certain column will contain quotation, whether to remove it or not :param save: boolean, whether to save the result or not, default is True :param outdir: string, the output folder. :param name: string, the output file name Example:: enhance_frequency_table(df=df,True,True,'result','freq_stage3_verbosity1_uid_gene_symbol_coord_mean_mle.txt') ''' print('adding gene symbol') df = add_gene_symbol_frequency_table(df=df,remove_quote=remove_quote) print('adding chromosome coordinates') df = add_coord_frequency_table(df,remove_quote=False) print('adding tumor specificity mean raw count') df = add_tumor_specificity_frequency_table(df,'mean',False) print('adding tumor specificity MLE score') df = add_tumor_specificity_frequency_table(df,'mle',False) if save: df.to_csv(os.path.join(outdir,name),sep='\t') return df
[docs]def add_coord_frequency_table(df,remove_quote=True): ''' Convert the uid to chromsome coordinates :param df: the input df, index must be uid :param remove_quote: bool, depending on how your df is loaded, if directly read from the disk, certain column will contain quotation, whether to remove it or not :return df: the output df, with added coordinate column Exmaple:: add_coord_frequency_table(df=df,remove_quote=True) ''' # the index has to be comma separated string from ast import literal_eval if remove_quote: df['samples'] = [literal_eval(item) for item in df['samples']] uid_list = [item.split(',')[1] for item in df.index] coord_list = [uid_to_coord(uid) for uid in uid_list] df['coord'] = coord_list return df
def uid_to_coord(uid): tmp_list = uid.split(':') if len(tmp_list) == 2: ensg,exons = tmp_list elif len(tmp_list) == 3: ensg = tmp_list[0] exons = ':'.join(tmp_list[1:]) first,second = exons.split('-') # figure out start_coord if '_' in first: actual_exon,trailing = first.split('_') try: attrs = dict_exonCoords[ensg][actual_exon] except KeyError: if 'U' in actual_exon: proxy_exon = list(dict_exonCoords[ensg].keys())[0] attrs = dict_exonCoords[ensg][proxy_exon] chrom = attrs[0] strand = attrs[1] start_coord = trailing else: # probably a rare error chrom = 'unknown' strand = 'unknown' start_coord = 'unknown' else: chrom = attrs[0] strand = attrs[1] start_coord = trailing else: actual_exon = first try: attrs = dict_exonCoords[ensg][actual_exon] except KeyError: chrom = 'unkonwn' strand = 'unknown' start_coord = 'unknown' else: chrom = attrs[0] strand = attrs[1] if strand == '+': start_coord = attrs[3] # end else: start_coord = attrs[2] # start # figure out end_coord if '_' in second: actual_exon,trailing = second.split('_') try: attrs = dict_exonCoords[ensg][actual_exon] except KeyError: if 'U' in actual_exon: end_coord = trailing elif 'ENSG' in actual_exon: end_coord = trailing else: end_coord = 'unknown' else: end_coord = trailing else: actual_exon = second try: attrs = dict_exonCoords[ensg][actual_exon] except KeyError: if 'ENSG' in actual_exon: ensg_second, actual_exon_second = actual_exon.split(':') try: attrs = dict_exonCoords[ensg_second][actual_exon_second] except KeyError: end_coord = 'unknown' else: if strand == '+': end_coord = attrs[2] # start else: end_coord = attrs[3] # end else: end_coord = 'unknown' else: if strand == '+': end_coord = attrs[2] # start else: end_coord = attrs[3] # end # assemble if strand == '+': assemble = '{}:{}-{}({})'.format(chrom,start_coord,end_coord,strand) else: assemble = '{}:{}-{}({})'.format(chrom,end_coord,start_coord,strand) return assemble