Source code for snaf.snaf

#!/data/salomonis2/LabFiles/Frank-Li/refactor/neo_env/bin/python3.7

import numpy as np
import pandas as pd
import os
import sys
import pickle
import h5py
from functools import reduce
from copy import deepcopy
import matplotlib.pyplot as plt
import matplotlib as mpl
import anndata as ad
import seaborn as sns
from scipy import stats
from scipy.optimize import minimize
import re
import requests
import xmltodict
import multiprocessing as mp
import functools
from tqdm import tqdm
from itertools import compress
from ast import literal_eval
from bisect import bisect, bisect_left


# for biopython, pip install biopython
from Bio.SeqIO.FastaIO import SimpleFastaParser
from Bio.Seq import Seq

# other modules
from .binding import run_netMHCpan,run_MHCflurry
from .deepimmuno import run_deepimmuno
from .data_io import *
from .visualize import *
from .gtex import *
from .gtex_viewer import *
from .downstream import *


import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42
mpl.rcParams['font.family'] = 'Arial'


# configuration
def snaf_configuration(exon_table,transcript_db,db_dir,fasta,software_path_arg=None,binding_method_arg=None):
    global dict_exonCoords
    global dict_fa
    global software_path
    global binding_method
    global dict_exonlist
    global dict_start_codon
    global phase_inferer_gtf_dict
    dict_exonCoords = exonCoords_to_dict(exon_table)
    dict_exonlist = construct_dict_exonlist(transcript_db)
    dict_fa = fasta_to_dict(fasta)
    software_path = software_path_arg
    binding_method = binding_method_arg

    df_start_codon = pd.read_csv(os.path.join(db_dir,'Alt91_db','df_start_codon.txt'),sep='\t',index_col=0)
    df_start_codon['start_codon'] = [literal_eval(item) for item in df_start_codon['start_codon']]
    df_start_codon['non_redundant'] = [literal_eval(item) for item in df_start_codon['non_redundant']]
    dict_start_codon = df_start_codon['start_codon'].to_dict()

    phase_inferer_gtf_dict = process_gtf(os.path.join(db_dir,'Alt91_db','Homo_sapiens.GRCh38.91.gtf'))

    


def process_gtf(gtf):
    # any official ensembl release gtf format
    '''
    gtf_dict['ENSG00000186092']

        {'ENST00000641515': [(65419, 65433), (65520, 65573), (69037, 71585)],
        'ENST00000335137': [(69055, 70108)]}
    '''
    gtf_dict = {}
    with open(gtf,'r') as f:
        for line in f:
            try:
                chrom, source, typ, start, end, score, strand, phase, attrs = line.rstrip('\n').split('\t')
            except ValueError:  # the header line
                continue
            if typ == 'gene':
                ensg = attrs.split(';')[0].split(' ')[1].strip('"')
                gtf_dict[ensg] = {}
            elif typ == 'transcript':
                enst = attrs.split(';')[2].split(' ')[2].strip('"')
                gtf_dict[ensg][enst] = []
            elif typ == 'exon':
                gtf_dict[ensg][enst].append((int(start),int(end)))
    return gtf_dict

def get_support_phase(ensg, coord_first_exon_last_base, pssc, strand, length_first):
    coord_first_exon_last_base, pssc, length_first = int(coord_first_exon_last_base), int(pssc), int(length_first)
    all_trans = phase_inferer_gtf_dict[ensg]
    supports = []
    if strand == '+':
        for enst, tran in all_trans.items():
            lis = []
            [lis.extend(list(exon)) for exon in tran]
            pssc_insert_pos = bisect(lis, pssc)
            coord_first_exon_first_base = coord_first_exon_last_base - length_first + 1
            junction_insert_pos = bisect(lis, coord_first_exon_first_base)

            if junction_insert_pos % 2 == 1 and pssc_insert_pos % 2 == 1:
                if junction_insert_pos == pssc_insert_pos:
                    n_bases = coord_first_exon_first_base - pssc + 1
                elif junction_insert_pos > pssc_insert_pos:
                    start_exon_index = (pssc_insert_pos - 1) // 2
                    end_exon_index = (junction_insert_pos - 1) // 2
                    n_bases = 0
                    for i, exon in enumerate(tran):
                        if i == start_exon_index:
                            n_bases += (exon[1] - pssc + 1)
                        elif i == end_exon_index:
                            n_bases += (coord_first_exon_first_base - exon[0] + 1)
                        else:
                            if i > start_exon_index and i < end_exon_index:
                                n_bases += (exon[1] - exon[0] + 1)
                else:
                    continue
            else:
                continue


            remainder = n_bases % 3
            '''
            *   *   *   *   |*   *   *    *    *
                            |6   7   8    9    10

            remainder means a fragment including the first base in the first exon, how many left. 1 left means 6 should be the first base of new codon
            2 left means 6 should be 6 is the second in last codon, 7 will be the last base in the last codon, have to start with 8
            0 left means 6 should be the last base in the last codon, so that we start with 7
            '''
            if remainder == 1:
                phase = 0
            elif remainder == 2:
                phase = 2
            elif remainder == 0:
                phase = 1
            supports.append((phase, pssc, enst,strand))

    elif strand == '-':
        for enst, tran in all_trans.items():
            tran.sort(
                key=lambda x: x[0])  # because negative strand looks like [(5,7),(2,4)], we change it to [(2,4),(5,7)]
            lis = []
            [lis.extend(list(exon)) for exon in tran]
            pssc_insert_pos = bisect_left(lis, pssc)
            coord_first_exon_first_base = coord_first_exon_last_base + length_first - 1
            junction_insert_pos = bisect_left(lis, coord_first_exon_first_base)

            if junction_insert_pos % 2 == 1 and pssc_insert_pos % 2 == 1:
                if junction_insert_pos == pssc_insert_pos:
                    n_bases = pssc - coord_first_exon_first_base + 1
                elif junction_insert_pos < pssc_insert_pos:
                    start_exon_index = (pssc_insert_pos - 1) // 2
                    end_exon_index = (junction_insert_pos - 1) // 2
                    n_bases = 0
                    for i, exon in enumerate(tran):
                        if i == start_exon_index:
                            n_bases += (pssc - exon[0] + 1)
                        elif i == end_exon_index:
                            n_bases += (exon[1] - coord_first_exon_first_base + 1)
                        else:
                            if i > end_exon_index and i < start_exon_index:
                                n_bases += (exon[1] - exon[0] + 1)
                else:
                    continue
            else:
                continue



            remainder = n_bases % 3  # same idea as above forward strand, see explanation above
            if remainder == 1:
                phase = 0
            elif remainder == 2:
                phase = 2
            elif remainder == 0:
                phase = 1
            supports.append((phase, pssc, enst, strand))

    return supports   



    
def is_in_db(valid):
    # provide a list with uid without repeats
    mapping = {}
    for uid in tqdm(valid):
        ensg = uid.split(':')[0]
        exons = ':'.join(uid.split(':')[1:])
        if '_' in exons or 'U' in exons or 'ENSG' in exons or 'I' in exons:
            mapping[uid] = False
        else:
            exonlist = dict_exonlist[ensg]
            exonstring = '|'.join(exonlist)
            e1,e2 = exons.split('-')
            pattern1 = re.compile(r'^{}\|{}\|'.format(e1,e2))  # ^E1.1|E2.3|
            pattern2 = re.compile(r'\|{}\|{}$'.format(e1,e2))  # |E1.1|E2.3$
            pattern3 = re.compile(r'\|{}\|{}\|'.format(e1,e2)) # |E1.1|E2.3|
            if re.search(pattern3,exonstring) or re.search(pattern2,exonstring) or re.search(pattern1,exonstring):   # as long as match one pattern, should be eliminated
                mapping[uid] = True
            else:
                mapping[uid] = False
    return mapping








[docs]class JunctionCountMatrixQuery():

    '''
    Instantiate the JunctionCountMatrixQuery class

    :param junction_count_matrix: The pandas dataframe for the junction_count_matrix, outputted by AltAnalyze
    :param cores: int, how many cores you'd like to use, if None, then will automatically detect the maximum amount of cores in the OS
    :param add_control: None or a dictionary containing additional controls, additional controls can a dataframe or anndata, for instance, if adding two controls asides from internal GTEx,
                        {'tcga_matched_control':df,'gtex_skin':adata}, when added database contains multiple tissue types, it is recommended to pass as a AnnData with the tissue
                        information stored as adata.var['tissue'], otherwise, if passed as a dataframe, or samples will be recognized as a single tissue type, it will affect tumor specifcity
                        score calculation if using MLE or bayesian hierarchical models.
    :param not_in_db: boolean, whether to remove junctions that are present in any Ensembl documented transcript, remember some of the documented transcript in
                      Ensembl are tumor-specific as well, doing so may remove some bona fide hits. But good for reducing number of neoantigens for validation.
    :param outdir: string, the output folder for storing results
    :param filter_mode: string, either 'maxmin' or 'prevalance', this correspond to the parameters that will be used in snaf.initialize, if maxmin, use t_min and n_max, 
                        if prevalance, use normal and tumor cutoff

    Example::

        jcmq = JunctionCountMatrixQuery(junction_count_matrix=df,cores=50,add_control=control_df,not_in_db=True)
        
    '''

    def __init__(self,junction_count_matrix,cores=None,add_control=None,not_in_db=False,outdir='.',filter_mode='maxmin'):
        self.junction_count_matrix = junction_count_matrix
        if cores is None:
            cores = mp.cpu_count()
        self.cores = cores
        if not_in_db:
            self.get_neojunctions(add_control=add_control,dict_exonlist=dict_exonlist,outdir=outdir,filter_mode=filter_mode)
        else:
            self.get_neojunctions(add_control=add_control,dict_exonlist=None,outdir=outdir,filter_mode=filter_mode)
        

    def __str__(self):
        try:
            len_translated = len(self.translated)
        except AttributeError:
            len_translated = None
        try:
            shape_cond_subset_df = self.cond_subset_df.shape
        except AttributeError:
            shape_cond_subset_df = None
        try:
            len_results = len(self.results)
        except AttributeError:
            len_results = None
        return 'junction_count_matrix: {}\n'\
               'cores: {}\n'\
               'valid: {}\n'\
               'invalid: {}\n'\
               'cond_df: {}\n'\
               'subset: {}\n'\
               'translated: list of {} nj objects\n'\
               'cond_subset_df: {}\n'\
               'results: list of length {}'.format(self.junction_count_matrix.shape,self.cores,len(self.valid),len(self.invalid),
                                               self.cond_df.shape,self.subset.shape,len_translated,shape_cond_subset_df,len_results)
    
    def get_neojunctions(self,add_control,dict_exonlist,outdir,filter_mode):
        self.valid, self.invalid, self.cond_df = multiple_crude_sifting(self.junction_count_matrix,add_control,dict_exonlist,outdir,filter_mode)
        self.subset = self.junction_count_matrix.loc[self.valid,:]
        self.cond_subset_df = self.cond_df.loc[self.valid,:]

    def get_neojunction_info(self,event):  # more for B cell antigen, since we design it in a way that B cell pipeline completely rely on T pipeline for GTEx check
        ed = self.subset.loc[event,:].to_dict()
        freq = np.count_nonzero(np.array(list(ed.values())))/len(ed)
        return ed,freq


[docs]    @staticmethod
    def get_membrane_tuples(df,**kwargs):
        '''
        this function is used by SurfaceAntigen pipeline to filter out splicing evnets that are not tumor-specific and also compute
        useful informations for each membrane protein.

        :param df: pandas dataframe, the junction count matrix
        :param **kwargs: will be passed to __init__ of JunctionCountMatrixQuery class function

        :return membrane_tuples: a list, in which each item is a tuple (uid,mean_gtex,df_gtex,ed,freq). 

            * uid: the uid of the splicing evnet
            * mean_gtex: the mean raw read count across GTEx
            * df_gtex: a dataframe of gtex sample name, tissue types and read count values
            * ed: expression dictionary for tumor samples
            * freq: expression frequency for tumor samples

        Example::

            membrane_tuples = snaf.JunctionCountMatrixQuery.get_membrane_tuples(df)

        '''
        from .surface import filter_to_membrane_protein
        jcmq = JunctionCountMatrixQuery(junction_count_matrix=df,**kwargs)
        print(jcmq)
        neojunctions = jcmq.valid
        membrane_uid = filter_to_membrane_protein(neojunctions)
        membrane_tuples = []
        for uid in membrane_uid:
            mean_gtex, df_gtex = tumor_specificity(uid,method='mean',return_df=True)
            ed, freq = jcmq.get_neojunction_info(uid)
            membrane_tuples.append((uid,mean_gtex,df_gtex,ed,freq))
        return membrane_tuples



    @staticmethod    
    def split_df_to_chunks(df,cores=None):
        df_index = np.arange(df.shape[0])
        if cores is None:
            cores = mp.cpu_count()
        sub_indices = np.array_split(df_index,cores)
        sub_dfs = [df.iloc[sub_index,:] for sub_index in sub_indices]
        return sub_dfs

    @staticmethod
    def split_array_to_chunks(array,cores=None):
        if not isinstance(array,list):
            raise Exception('split_array_to_chunks function works for list, not ndarray')
        array_index = np.arange(len(array))
        if cores is None:
            cores = mp.cpu_count()
        sub_indices = np.array_split(array_index,cores)
        sub_arrays = []
        for sub_index in sub_indices:
            item_in_group = []
            for i in sub_index:
                item_in_group.append(array[i])
            sub_arrays.append(item_in_group)
        return sub_arrays
        
            
    @staticmethod
    def each_chunk_func(input_,kind,hlas=None,strict=False,sub_cond_df=None,binding_method=None):
        if kind == 1:
            nj_list = []
            for uid in input_.index:
                nj = NeoJunction(uid=uid,count=50,check_gtex=False)
                nj.detect_type()
                nj.retrieve_junction_seq()
                nj.in_silico_translation(strict=strict)    
                nj_list.append(nj) 
        elif kind == 2:   # out of development
            nj_list = []
            for uid in input_.index:
                nj = NeoJunction(uid=uid,count=50,check_gtex=False)
                nj.detect_type()
                nj.retrieve_junction_seq()
                nj.in_silico_translation()    
                nj.binding_prediction()
                nj.immunogenicity_prediction()
                nj_list.append(nj)  
        elif kind == 3:  # currently recommended
            nj_list = []
            assert len(input_) == sub_cond_df.shape[0]
            for nj,index in tqdm(zip(input_,range(sub_cond_df.shape[0])),total=len(input_)):
                cond_row = sub_cond_df.iloc[index].tolist()
                hlas_row = list(compress(hlas,cond_row))
                combined_unique_hlas = reduce(lambda a,b:list(set(a+b)),hlas_row)
                try:
                    nj.binding_prediction(hlas=combined_unique_hlas,binding_method=binding_method)
                except Exception as e:
                    if str(e) == 'Already no candidates after in-silico translation':
                        nj_list.append(None)
                        continue
                    else:
                        raise Exception('binding prediction error: {}'.format(e))
                try:
                    nj.immunogenicity_prediction()
                except Exception as e:
                    if str(e) == 'Already no candidates after binding prediction':
                        nj_list.append(None)
                        continue               
                    else:
                        raise Exception('immunogenic prediction error: {}'.format(e))     
                nj_list.append(nj) 
        elif kind == 4:
            nj_list = []
            for nj in input_:
                if nj is None:
                    nj_list.append(None)
                    continue
                try:
                    nj.binding_prediction(hlas=hlas)
                except Exception as e:
                    nj_list.append(None)
                    continue
                try:
                    nj.immunogenicity_prediction()
                except Exception as e:
                    nj_list.append(None)
                    continue
                nj_list.append(nj)                 
        return nj_list

[docs]    def run(self,hlas,strict=False,outdir='.',name='after_prediction.p'):
        '''
        main function to run mhc bound peptide T antigen pipeline

        :param hlas: list, each item is a list of 6 HLA types associated with each sample, the order of this list must be consistent with the sample name in the df column
        :param strict: boolean, default is False, determine whether to filter out peptide which doesn't have canonical start codon support
        :param outdir: string, the path where all the results will go into
        :param name: string, the name of the generated pickle result object

        Example::

            jcmq.run(hlas=hlas,outdir='result',name='after_prediction.p')
        '''
        self.parallelize_run(kind=1,strict=strict)
        print(self)
        self.parallelize_run(kind=3,hlas=hlas)
        self.serialize(outdir=outdir,name=name)

[docs]    @staticmethod
    def generate_results(path,outdir,criterion=None):
        '''
        wrapper function to automatically generate all necessary output

        :param path: string, where the pickle result file lie on the disk
        :param outdir: string, where to generate all the necessary result

        Example::

            JunctionCountMatrixQuery.generate_results(path='result/after_prediction.p',outdir='result')

        '''

        jcmq = JunctionCountMatrixQuery.deserialize(name=path)
        stage0_compatible_results(jcmq,outdir=outdir)
        for stage in [3,2,1]:
            jcmq.show_neoantigen_burden(outdir=outdir,name='burden_stage{}.txt'.format(stage),stage=stage,verbosity=1,contain_uid=False,criterion=criterion)
            jcmq.show_neoantigen_frequency(outdir=outdir,name='frequency_stage{}.txt'.format(stage),stage=stage,verbosity=1,contain_uid=False,plot=True,plot_name='frequency_stage{}.pdf'.format(stage),criterion=criterion)
            jcmq.show_neoantigen_frequency(outdir=outdir,name='frequency_stage{}_verbosity1_uid.txt'.format(stage),stage=stage,verbosity=1,contain_uid=True,plot=False,criterion=criterion)
            # add additional attributes
            df = pd.read_csv(os.path.join(outdir,'frequency_stage{}_verbosity1_uid.txt'.format(stage)),sep='\t',index_col=0)
            enhance_frequency_table(df,True,True,outdir,'frequency_stage{}_verbosity1_uid_gene_symbol_coord_mean_mle.txt'.format(stage))
            # report candidates
            if stage == 3:
                dff = pd.read_csv(os.path.join(outdir,'frequency_stage{}_verbosity1_uid_gene_symbol_coord_mean_mle.txt'.format(stage)),sep='\t',index_col=0)
                for sample in tqdm(jcmq.junction_count_matrix.columns,total=jcmq.junction_count_matrix.shape[1]):
                    report_candidates(jcmq,dff,sample,os.path.join(outdir,'T_candidates'),True)
                print('concatenating all T antigen files into one & indicate whether in AltAnalyze database')
                df_list = []
                for sample in jcmq.junction_count_matrix.columns:
                    df_list.append(pd.read_csv(os.path.join(outdir,'T_candidates','T_antigen_candidates_{}.txt'.format(sample)),sep='\t',index_col=0))
                final_df = pd.concat(df_list,axis=0)
                all_uid = final_df['uid'].unique().tolist()
                mapping = is_in_db(all_uid)
                final_df['in_db'] = final_df['uid'].map(mapping).values
                final_df.to_csv(os.path.join(outdir,'T_candidates','T_antigen_candidates_all.txt'),sep='\t')

        # add additional attributes to stage0
        df = pd.read_csv(os.path.join(outdir,'frequency_stage0.txt'),sep='\t',index_col=0)
        df.index = [','.join([item,item]) for item in df.index]
        df.to_csv(os.path.join(outdir,'frequency_stage0_verbosity1_uid.txt'),sep='\t')
        df = pd.read_csv(os.path.join(outdir,'frequency_stage0_verbosity1_uid.txt'),sep='\t',index_col=0)
        enhance_frequency_table(df,True,True,outdir,'frequency_stage0_verbosity1_uid_gene_symbol_coord_mean_mle.txt')        

    def parallelize_run(self,kind,hlas=None,strict=False):
        pool = mp.Pool(processes=self.cores)
        if kind == 1 or kind == 2:
            sub_dfs = JunctionCountMatrixQuery.split_df_to_chunks(self.subset,self.cores)
            r = [pool.apply_async(func=JunctionCountMatrixQuery.each_chunk_func,args=(sub_df,kind,None,strict,)) for sub_df in sub_dfs]
            pool.close()
            pool.join()
            results = []
            for collect in r:
                result = collect.get()
                results.extend(result)
            self.translated = results
        elif kind == 3:
            sub_arrays = JunctionCountMatrixQuery.split_array_to_chunks(self.translated,self.cores)
            sub_cond_dfs = JunctionCountMatrixQuery.split_df_to_chunks(self.cond_subset_df,self.cores)
            r = [pool.apply_async(func=JunctionCountMatrixQuery.each_chunk_func,args=(sub_array,kind,hlas,False,sub_cond_df,binding_method,)) for sub_array,sub_cond_df in zip(sub_arrays,sub_cond_dfs)]
            pool.close()
            pool.join()
            results = []
            for collect in r:
                result = collect.get()
                results.extend(result)
            self.results = (results,hlas)


        elif kind == 4: # heterogeneous hlas for each sample
            self.cond_subset_df = self.cond_df.loc[self.valid,:]
            need_to_predict_across_samples = []  # [[nj,None,nj,nj,None...],]
            for i,(label,content) in enumerate(self.cond_subset_df.iteritems()):
                need_to_predict = []
                number_need = 0
                for item in zip(self.translated,content.values):
                    if item[1]:
                        need_to_predict.append(item[0])
                        number_need += 1
                    else:
                        need_to_predict.append(None)
                need_to_predict_across_samples.append(need_to_predict)
                print('{} has {} junctions to proceed predictions, HLA are {}'.format(label,number_need,hlas[i]))
            print('Binding and immunogenicity prediction starts')
            r = [pool.apply_async(func=JunctionCountMatrixQuery.each_chunk_func,args=(need_to_predict,kind,hlas,)) for need_to_predict,hlas in zip(need_to_predict_across_samples,hlas)]
            pool.close()
            pool.join()
            results = []
            for collect in r:
                result = collect.get()
                results.append(result)
            self.results = results 

    def serialize(self,outdir,name):
        if not os.path.exists(outdir):
            os.mkdir(outdir)
        with open(os.path.join(outdir,name),'wb') as f:
            pickle.dump(self,f,protocol=pickle.HIGHEST_PROTOCOL)  

    @staticmethod
    def deserialize(name):
        with open(name,'rb') as f:
            jcmq = pickle.load(f)
        return jcmq

[docs]    def visualize(self,uid,sample,outdir,tumor=False,criterion=[('netMHCpan_el',0,'<=',2),('deepimmuno_immunogenicity',1,'==','True'),]):
        '''
        Visualize certain Neojunction in certain sample

        :param uid: string, the uid for the event you want to check
        :param sample: string, the sample name
        :param outdir: string, where to deposite the figure
        :param tumor: bool, whether to show the expression level in tumor sample as well, default is not
        :param criterion: a nested tuple, if none of the neoantigens are bound and immunogenic, no candidates will be plotted, so you can relax the criterion a bit based on the format specified.

        Example::

            jcmq.visualize(uid='ENSG00000167291:E38.6-E39.1',sample='TCGA-DA-A1I1-06A-12R-A18U-07.bed',outdir='./result')
        '''
        if not os.path.exists(outdir):
            os.mkdir(outdir)
        if sample is not None:
            row_index = self.subset.index.tolist().index(uid)
            col_index = self.subset.columns.tolist().index(sample)
            results = self.results[0]
            hlas = self.results[1]
            nj = deepcopy(results[row_index])
            nj.enhanced_peptides = nj.enhanced_peptides.filter_based_on_hla(selected_hla=hlas[col_index])
            nj.visualize(outdir,'{}_{}.pdf'.format(uid.replace(':','_'),sample),criterion=criterion)
        if tumor:
            # in tumor sample
            fig,ax = plt.subplots()
            counts = self.junction_count_matrix.loc[uid,:]
            ax.plot(np.arange(len(counts)),counts.values,marker='o',markersize=3,color='k',markerfacecolor='r',markeredgecolor='r',linestyle='--')
            ax.set_xticks(np.arange(len(counts)))
            ax.set_xticklabels(counts.index.values,fontsize=1,rotation=90)
            ax.set_title('{}_tumor'.format(uid),fontsize=8)
            ax.set_ylim(bottom=-0.05)
            ax.set_ylabel('counts')
            plt.savefig(os.path.join(outdir,'{}_tumor.pdf'.format(uid.replace(':','_'))))
            plt.close()

    def show_neoantigen_burden(self,outdir,name,stage,verbosity,contain_uid,criterion=None):
        if not os.path.exists(outdir):
            os.mkdir(outdir)

        sub_arrays = JunctionCountMatrixQuery.split_array_to_chunks(self.results[0],self.cores)
        sub_conds = JunctionCountMatrixQuery.split_df_to_chunks(self.cond_subset_df,self.cores)
        hlas = self.results[1]
        pool = mp.Pool(processes=self.cores)

        r = [pool.apply_async(func=JunctionCountMatrixQuery.show_neoantigen_burden_single_run,args=(sub_array,sub_cond,hlas,stage,verbosity,contain_uid,criterion,)) for sub_array,sub_cond in zip(sub_arrays,sub_conds)]
        pool.close()
        pool.join()
        results = []
        for collect in r:
            result = collect.get()
            results.append(result)
        
        burden_valid = pd.concat(results,axis=0)
        burden_valid.index = self.subset.index
        burden_valid.columns = self.subset.columns

        burden_invalid = self.junction_count_matrix.loc[self.invalid,:].copy()
        for col in burden_invalid.columns:
            burden_invalid.loc[:,col] = np.full(burden_invalid.shape[0],0)
        
        burden = pd.concat([burden_valid,burden_invalid],axis=0)

        # statistics
        burden['mean'] = burden.mean(axis=1)
        burden.loc['burden'] = burden.sum(axis=0)
        burden.sort_values(by='mean',axis=0,ascending=False,inplace=True)
        burden.sort_values(by='burden',axis=1,ascending=True,inplace=True)
        # reorder a bit
        c_list = burden.columns.tolist()
        c_list.remove('mean')
        c_list.append('mean')
        i_list = burden.index.tolist()
        i_list.remove('burden')
        i_list.append('burden')
        burden = burden.loc[i_list,c_list]
        burden.to_csv(os.path.join(outdir,name),sep='\t')    

    def show_neoantigen_frequency(self,outdir,name,stage,verbosity,contain_uid,plot,plot_name=None,yscale='linear',criterion=None):
        if not os.path.exists(outdir):
            os.mkdir(outdir)

        sub_arrays = JunctionCountMatrixQuery.split_array_to_chunks(self.results[0],self.cores)
        sub_conds = JunctionCountMatrixQuery.split_df_to_chunks(self.cond_subset_df,self.cores)
        hlas = self.results[1]
        column_names = self.subset.columns
        pool = mp.Pool(processes=self.cores)

        r = [pool.apply_async(func=JunctionCountMatrixQuery.show_neoantigen_frequency_single_run,args=(sub_array,sub_cond,hlas,column_names,stage,verbosity,contain_uid,criterion,)) for sub_array,sub_cond in zip(sub_arrays,sub_conds)]
        pool.close()
        pool.join()
        results = []
        for collect in r:
            result = collect.get()
            results.append(result)  

        # let's define another function for merging dictionary, but not overwrite the values
        def merge_key_and_values(dic1,dic2):
            return {x:list(set(dic1.get(x,[]) + dic2.get(x,[]))) for x in set(dic1).union(set(dic2))}         

        dic = reduce(merge_key_and_values,results) 

        # statistics
        df = pd.Series(data=dic).to_frame(name='samples')
        df['n_sample'] = df.apply(lambda x:len(set(x[0])),axis=1).values
        df.sort_values(by='n_sample',ascending=False,inplace=True)
        df.to_csv(os.path.join(outdir,name),sep='\t')
        # plot
        if plot:
            fig,ax = plt.subplots()
            ax.bar(x=np.arange(df.shape[0]),height=df['n_sample'].values,edgecolor='k')
            ax.set_xlabel('Neoantigen rank by its occurence (descending order)')
            ax.set_ylabel('Occurence (n_sample)')
            ax.set_title('Neoantigen Occurence')
            plt.savefig(os.path.join(outdir,'x_neoantigen_{}'.format(plot_name)),bbox_inches='tight')
            plt.close()
            fig,ax = plt.subplots()
            try:
                sns.histplot(df['n_sample'].values,binwidth=1,kde=True,ax=ax)
                ax.set_yscale(yscale)
                plt.savefig(os.path.join(outdir,'x_occurence_{}'.format(plot_name)),bbox_inches='tight')
                plt.close()
            except ValueError:
                print('All neoantigens are present in same amount of patients, seaborn histplot can not properly figure out binedge, no occurence plot generated')





[docs]    def show_neoantigen_as_fasta(self,outdir,name,stage,verbosity,contain_uid,sample=None,criterion=None):
        '''
        write the neoantigen as a fasta file for MS validation or other purpose

        :param outdir: string ,the output directory for the generated fasta file
        :param name: string, the name of the output fasta file
        :param stage: int, either 0,1,2,3, the stage of neoantigen you want to report
        :param verbosity: int, either 1,2,3, the verbosity of candidates
        :param contain_uid: boolean, whether you want to contain the junction UID along with the neoantigen
        :param sample: string, the name of the sample in which you want to extract the neoantigen
        :param criterion: nested list, the criterion for filtering out the candidate

        Examples::

            jcmq = snaf.JunctionCountMatrixQuery.deserialize('result/after_prediction.p')
            sample = 'SRR5933735.Aligned.sortedByCoord.out'
            jcmq.show_neoantigen_as_fasta(outdir='./fasta',name='neoantigen_{}.fasta'.format(sample),stage=2,verbosity=1,contain_uid=True,sample=sample)

        '''
        if not os.path.exists(outdir):
            os.mkdir(outdir)
            
        sub_arrays = JunctionCountMatrixQuery.split_array_to_chunks(self.results[0],self.cores)
        sub_conds = JunctionCountMatrixQuery.split_df_to_chunks(self.cond_subset_df,self.cores)
        hlas = self.results[1]
        column_names = self.subset.columns
        pool = mp.Pool(processes=self.cores)

        r = [pool.apply_async(func=JunctionCountMatrixQuery.show_neoantigen_frequency_single_run,args=(sub_array,sub_cond,hlas,column_names,stage,verbosity,contain_uid,criterion,)) for sub_array,sub_cond in zip(sub_arrays,sub_conds)]
        pool.close()
        pool.join()
        results = []
        for collect in r:
            result = collect.get()
            results.append(result)  

        # let's define another function for merging dictionary, but not overwrite the values
        def merge_key_and_values(dic1,dic2):
            return {x:list(set(dic1.get(x,[]) + dic2.get(x,[]))) for x in set(dic1).union(set(dic2))}         

        dic = reduce(merge_key_and_values,results) 
        # statistics
        df = pd.Series(data=dic).to_frame(name='samples')
        df['n_sample'] = df.apply(lambda x:len(set(x[0])),axis=1).values
        df.sort_values(by='n_sample',ascending=False,inplace=True)
        # subset
        if sample is not None:
            df = df.loc[df.apply(lambda x:sample in x['samples'], axis=1),:] 
            name = 'neoantigen_' + sample + '.fasta'
        # write to fasta
        with open(os.path.join(outdir,name),'w') as f:
            for row in df.itertuples():
                if not contain_uid:   # as the row.Index is a tuple
                    tmp = [str(item) for item in row.Index]
                    f.write('>{}|{}\n'.format('|'.join(tmp),row.n_sample)) 
                    f.write('{}\n'.format(row.Index[0]))   
                else:   # as the row.Index is a string delimited by comma
                    f.write('>{}|{}\n'.format(row.Index.replace(',','|'),row.n_sample)) 
                    f.write('{}\n'.format(row.Index.split(',')[0]))
                


    # let's define an atomic function inside here
    @staticmethod
    def show_neoantigen_burden_single_run(sub_array,sub_cond,hlas,stage,verbosity,contain_uid,criterion=None):
        nj_burden_2d = []
        for nj,index in zip(sub_array,range(sub_cond.shape[0])):
            nj_burden = []
            cond_row = sub_cond.iloc[index].values
            for hla,cond in zip(hlas,cond_row):
                if nj is None or not cond :
                    nj_burden.append(0)
                else:
                    nj_copy = deepcopy(nj)
                    nj_copy.enhanced_peptides = nj_copy.enhanced_peptides.filter_based_on_hla(selected_hla=hla)
                    nj_copy.derive_candidates(stage=stage,verbosity=verbosity,contain_uid=contain_uid,criterion=criterion)
                    nj_burden.append(len(nj_copy.candidates))
            nj_burden_2d.append(nj_burden)  
        df = pd.DataFrame(data=nj_burden_2d)
        return df

    # let's define the single function to run in each subprocess
    @staticmethod
    def show_neoantigen_frequency_single_run(sub_array,sub_cond,hlas,column_names,stage,verbosity,contain_uid,criterion=None):
        dic = {}
        for nj,index in zip(sub_array,range(sub_cond.shape[0])):
            cond_row = sub_cond.iloc[index].values
            for hla,column_name,cond in zip(hlas,column_names,cond_row):
                if nj is not None and cond:
                    nj_copy = deepcopy(nj)
                    nj_copy.enhanced_peptides = nj_copy.enhanced_peptides.filter_based_on_hla(selected_hla=hla)
                    nj_copy.derive_candidates(stage=stage,verbosity=verbosity,contain_uid=contain_uid,criterion=criterion)
                    for cand in nj_copy.candidates:
                        try:
                            dic[cand].append(column_name)
                        except KeyError:
                            dic[cand] = []
                            dic[cand].append(column_name)       
        return dic   

class EnhancedPeptides():
    def __init__(self,peptides,hlas,kind):
        # kind means how to instantiate the EnhancedPeptides object
        if kind == 2:  # hla-reduced enhanced peptide, this mode is used when filtsre_based_on_hla
            self.mers = peptides
            self.info = hlas
        else:
            self.mers = []
            self.info = []
            for k,v in peptides.items():
                self.mers.append(k)
                phlas = {}  # {pep1:{origin:(extra,n_from_first,phase,evidences),hla1:{},hla2:{}}}
                for pep in v:
                    if kind == 0:  # each pep is (pep,extra,n_from_first,phase,evidences), this mode is used before binding_prediction
                        pairs = {}
                        pairs['origin'] = (pep[1],pep[2],pep[3],pep[4])
                        for hla in hlas:
                            pairs[hla] = {}
                        phlas[pep[0]] = pairs
                    elif kind == 1:   # each pep is (pep,extra,n_from_first,hla,phase,evidences), this mode is used when filter_based_on_criterion
                        try:
                            phlas[pep[0]]['origin'] = (pep[1],pep[2],pep[4],pep[5])
                        except KeyError:
                            phlas[pep[0]] = {}
                            phlas[pep[0]]['origin'] = (pep[1],pep[2],pep[4],pep[5])
                        phlas[pep[0]][pep[3]] = {}
                self.info.append(phlas)

    def __str__(self):
        return str(self.info)

    def __getitem__(self,key):
        index = self.mers.index(key)
        return self.info[index]

    def is_empty(self):
        total = 0
        for dic in self.info:
            total += len(dic)
        if total == 0:
            return True
        else:
            return False

    def simplify_to_list(self,verbosity):   # make sure to first filter and re-instantiate
        if verbosity==1:  # only peptide, result is also tuple
            result_list = []
            for mer in self.mers:
                result_list.extend(list(self[mer].keys()))   # utlize __getitem__ method, so self[9] instead of self[0]
            result_list = [(item,) for item in result_list]
        elif verbosity==2:  # peptide and hla
            result_list = []
            for mer in self.mers:
                for pep,hla_complex in self[mer].items():
                    for hla in hla_complex.keys():
                        if hla != 'origin':
                            result_list.append((pep,hla))
        elif verbosity==3:
            result_list = []
            for mer in self.mers:
                for pep,hla_complex in self[mer].items():
                    extra,n_from_first,phase,evidences = hla_complex['origin']
                    for hla in hla_complex.keys():
                        if hla != 'origin':
                            result_list.append((pep,hla,extra,n_from_first))
        return result_list

    def register_attr(self,df,attr_name):
        '''
        df should follow:
          peptide     mer     hla       score    identity
         AAAAAAAAA     9   HLA-A*01:01   0.3       SB        
        '''
        for mer,sub_df in df.groupby(by='mer'):
            index = self.mers.index(int(mer))
            for peptide,sub_sub_df in sub_df.groupby(by='peptide'):
                for row in sub_sub_df.itertuples(index=False):
                    self.info[index][peptide][row.hla][attr_name] = (float(row.score),str(row.identity))

    def filter_based_on_hla(self,selected_hla,reinstantiate=True):
        new_mers = []
        new_info = []
        selected_hla = set(selected_hla)
        for i,k in enumerate(self.mers):
            new_info_container = {}
            for pep,hla_complex in self.info[i].items():
                reduced_hla_complex = {}
                for hla,attrs in hla_complex.items():
                    if hla == 'origin':
                        reduced_hla_complex['origin'] = attrs
                    else:
                        if hla in selected_hla:
                            reduced_hla_complex[hla] = attrs
                new_info_container[pep] = reduced_hla_complex
            new_mers.append(k)
            new_info.append(new_info_container)
        if reinstantiate:
            ep = EnhancedPeptides(peptides=new_mers,hlas=new_info,kind=2)
        return ep
                

        


    def filter_based_on_criterion(self,criteria,reinstantiate=True):
        # criterion: [(net),], ['netMHCpan_el',1,==,SB]
        peptides = {k:[] for k in self.mers}
        for i,k in enumerate(self.mers):
            for pep,hla_complex in self.info[i].items():
                extra,n_from_first,phase,evidences = hla_complex['origin']
                for hla,attrs in hla_complex.items():
                    if hla == 'origin':
                        continue
                    boolean_list = []
                    for criterion in criteria:
                        if criterion[1] == 1:   # matched is not a number
                            eval_string = 'attrs[\'{}\'][{}] {} \'{}\''.format(criterion[0],criterion[1],criterion[2],criterion[3])
                        elif criterion[1] == 0:   # matched is a number
                            eval_string = 'attrs[\'{}\'][{}] {} {}'.format(criterion[0],criterion[1],criterion[2],criterion[3])
                        try:
                            boolean = eval(eval_string)                            
                        except KeyError:
                            boolean = False
                        boolean_list.append(boolean)
                    boolean_final = all(boolean_list)
                    if boolean_final:
                        peptides[k].append((pep,extra,n_from_first,hla,phase,evidences))   # if not reinstantiate, this format is called reduced form
        if reinstantiate:
            return EnhancedPeptides(peptides,None,1)
        else:
            return peptides


class NeoJunction():
    def __init__(self,uid,count,check_gtex):
        if check_gtex:
            NeoJunction.is_neojunction(uid,count)
        self.uid = uid
        self.count = count

    def __str__(self):
        try:
            ts = self.tumor_specificity
        except AttributeError:
            ts = None
        try:
            et = self.event_type
        except AttributeError:
            et = None
        try:
            j = self.junction[:5] + '...' + self.junction[-5:]
        except AttributeError:
            j = None
        try:
            peptides = {k:len(v) for k,v in self.peptides.items()}
        except AttributeError:
            peptides = None
        try:
            ep = {i:len(self.enhanced_peptides[i]) for i in self.enhanced_peptides.mers}
        except AttributeError:
            ep = None
        try:
            cand_len = len(self.candidates)
            cand_example = self.candidates[0]
        except AttributeError:
            cand_len,cand_example = None,None
        return 'uid: {}\n'\
               'count: {}\n'\
               'tumor specificity: {}\n'\
               'event type: {}\n'\
               'junction: {}\n'\
               'peptides: {}\n'\
               'Enhanced peptides: {}\n'\
               'Candidates: len is {}, first is {}\n'.format(self.uid,self.count,ts,et,j,peptides,ep,cand_len,cand_example)



    @staticmethod
    def is_neojunction(uid,count):
        identity,detail = crude_tumor_specificity(uid,count)
        if not identity:
            raise Exception('This is not a NeoJunction, instantiation fails')

    def infer_tumor_specificity(self,method):
        tumor_specificity = accurate_tumor_specificity(self.uid,method)
        self.tumor_specificity = tumor_specificity
        return tumor_specificity

    def gtex_viewer(self,kind,outdir):
        if kind == 1:   # line plot, all tissues, count (norm or not)
            gtex_visual_count(self.uid,norm=True,out_folder=outdir)
        elif kind == 2: # hist plot, combined, norm count
            gtex_visual_norm_count_combined(self.uid,out_folder=outdir)
        elif kind == 3:  # hist plot, per tissue, number of samples that are non-zero in each tissue
            gtex_visual_per_tissue_count(self.uid,out_folder=outdir)

    
    def detect_type(self):
        '''
        Ordinary: ENSG00000107902:E10.1-E12.1
        Alt3: ENSG00000110057:E5.1-E6.2_67996641
        Alt5: ENSG00000100321:E7.1_39364266-E8.1
        Intron Retention: ENSG00000115524:I4.1-E5.1
        Novel Exon: ENSG00000008441:I40.1_13076665-E41.1
        Trans-splicing: ENSG00000196565:E14.2-ENSG00000213934:E3.1
        UTR Event: ENSG00000164068:U0.1_49689185-E2.1
        '''
        valid_pattern = re.compile(r'^ENSG\d+:.+?-.+')
        if re.search(valid_pattern,self.uid):   # at least valid one
            if len(re.findall('ENSG',self.uid)) == 2:
                event_type = 'trans_splicing'
            elif 'U' in self.uid:
                event_type = 'utr_event'
            elif '_' in self.uid:
                subexon12 = self.uid.split(':')[1]
                subexon1, subexon2 = subexon12.split('-')
                if 'I' in subexon12:
                    event_type = 'novel_exon'
                elif '_' in subexon1 and '_' in subexon2:
                    event_type = 'alt5_alt3'
                elif '_' in subexon1 and '_' not in subexon2:
                    event_type = 'alt5'
                elif '_' in subexon2 and '_' not in subexon1:
                    event_type = 'alt3'
                else:
                    event_type = 'invalid'
            elif 'I' in self.uid:
                event_type = 'intron_retention'
            elif re.search(r'^ENSG\d+:E\d+\.\d+-E\d+\.\d+$',self.uid):
                e = self.uid.split(':')[1]
                e1 = e.split('-')[0]
                e2 = e.split('-')[1]
                e1_int = int(e1.split('.')[0][1:])
                e1_frac = int(e1.split('.')[1])
                e2_int = int(e2.split('.')[0][1:])
                e2_frac = int(e2.split('.')[1])                
                if e1 == e2:   # E5.1-E5.1
                    event_type = 'invalid'
                else:
                    if e1_int > e2_int or (e1_int==e2_int and e1_frac>e2_frac):   # E13.1-E12.4   or E12.10-E12.9
                        event_type = 'invalid'
                    else:
                        event_type = 'ordinary'
            else:
                event_type = 'invalid'
        else:
            event_type = 'invalid'
        self.event_type = event_type
        return event_type

    def retrieve_junction_seq(self):
        if self.event_type != 'invalid':
            ensid = self.uid.split(':')[0]
            subexon1,subexon2 = ':'.join(self.uid.split(':')[1:]).split('-')
            code = is_consecutive(subexon1,subexon2)
            seq1 = subexon_tran(subexon1,ensid,'site1',code)
            seq2 = subexon_tran(subexon2,ensid,'site2',code)
            junction = ','.join([seq1,seq2])
            self.junction = junction
        else:
            self.junction = '$' * 10   # indicating invalid uid


    def in_silico_translation(self,ks=[9,10],strict=False):
        peptides = {k:[] for k in ks}
        coord = uid_to_coord(self.uid)  # chr1:555-666(+)
        if '$' not in self.junction and '*' not in self.junction and '#' not in self.junction and 'unknown' not in coord:
            first,second = self.junction.split(',')
            # annotate peptide based on starting codon evidence
            ensg = self.uid.split(':')[0]
            strand = coord.split('(')[1].rstrip(')')
            if strand == '+':
                coord_first_exon_last_base = coord.split(':')[1].split('-')[0]
            elif strand == '-':
                coord_first_exon_last_base = coord.split(':')[1].split('(')[0].split('-')[1]
            possible_start_codon_coord = dict_start_codon.get(ensg,[])   # [333,444] or []
            support_phases_dict = {}   # {0:[(333,enst,+)]} or {}
            for pssc in possible_start_codon_coord:
                supports = get_support_phase(ensg,coord_first_exon_last_base,pssc,strand,len(first))
                for (phase_, pssc_, enst_, strand_) in supports:
                    support_phases_dict.setdefault(phase_,[]).append((pssc_,enst_,strand_))
            for phase in [0,1,2]:  # tranlation starts with index "phase"
                evidences = tuple(support_phases_dict.get(phase,[]))
                if strict and len(evidences)==0:  # in strict mode, without start_codon evidence, let's skip this phase
                    continue
                de_facto_first = first[phase:]
                pep_dict = get_peptides(de_facto_first,second,ks,phase,evidences)   # (pep,extra,n_from_first,phase,evidences)
                for k in ks:  
                    peptides[k].extend(pep_dict[k])
                    peptides[k] = list(set(peptides[k]))
        self.peptides = peptides
        return peptides

    def binding_prediction(self,hlas,binding_method=None):
        hlas = list(set(hlas))
        ep = EnhancedPeptides(self.peptides,hlas,0)
        if ep.is_empty():
            raise Exception('Already no candidates after in-silico translation')
        for k,v in self.peptides.items():
            try:
                v = list(zip(*v))[0]
            except IndexError:
                continue
            if binding_method == 'netMHCpan':
                df = run_netMHCpan(software_path,v,hla_formatting(hlas,'netMHCpan_output','netMHCpan_input'),k)
            elif binding_method == 'MHCflurry':
                df = run_MHCflurry(v,hla_formatting(hlas,'netMHCpan_output','deepimmuno'))
                df['hla'] = hla_formatting(df['hla'].tolist(),'deepimmuno','netMHCpan_output')
            ep.register_attr(df,'netMHCpan_el')
        self.enhanced_peptides = ep


    def immunogenicity_prediction(self):
        reduced = self.enhanced_peptides.filter_based_on_criterion([('netMHCpan_el',0,'<=',2),],False)
        ep = self.enhanced_peptides.filter_based_on_criterion([('netMHCpan_el',0,'<=',2),],True)
        if ep.is_empty():
            raise Exception('Already no candidates after binding prediction')
        for k,v in reduced.items():
            try:
                v_pep,v_hla = list(zip(*v))[0],list(zip(*v))[3]
            except IndexError:  # one of the mer, no candicate, but the other mer has, so pass the empty check
                continue
            data = np.column_stack((v_pep,v_hla))
            df_input = pd.DataFrame(data=data)
            df_input[1] = hla_formatting(df_input[1].tolist(),'netMHCpan_output','deepimmuno')
            df_output = run_deepimmuno(df_input)
            df = pd.DataFrame({'peptide':df_output['peptide'].values,'mer':[k]*df_output.shape[0],
                               'hla':hla_formatting(df_output['HLA'].values.tolist(),'deepimmuno','netMHCpan_output'),
                               'score':df_output['immunogenicity'].values,
                               'identity':[True if item > 0.5 else False for item in df_output['immunogenicity'].values]})
            self.enhanced_peptides.register_attr(df,attr_name='deepimmuno_immunogenicity')


    def derive_candidates(self,stage,verbosity,contain_uid=True,criterion=None):
        if stage == 1: # all translated peptides
            self.candidates = self.enhanced_peptides.simplify_to_list(verbosity=verbosity)
        elif stage == 2: # all bound peptides
            self.candidates = self.enhanced_peptides.filter_based_on_criterion([('netMHCpan_el',0,'<=',2),]).simplify_to_list(verbosity=verbosity)
        elif stage == 3: # immunogenic peptides
            self.candidates = self.enhanced_peptides.filter_based_on_criterion([('netMHCpan_el',0,'<=',2),('deepimmuno_immunogenicity',1,'==','True'),]).simplify_to_list(verbosity=verbosity)
        elif stage == 'custom':
            self.candidates = self.enhanced_peptides.filter_based_on_criterion(criterion).simplify_to_list(verbosity=verbosity)
        if contain_uid:
            new = []
            for item in self.candidates:
                tmp = [str(i) for i in item]
                tmp.append(self.uid)
                new.append(','.join(tmp))
            self.candidates = new





    def visualize(self,outdir,name,criterion=[('netMHCpan_el',0,'<=',2),('deepimmuno_immunogenicity',1,'==','True'),]):
        reduced = self.enhanced_peptides.filter_based_on_criterion(criterion,False) # now each element should add phase and evidences as well
        '''
        {9: [('LPSPPAQEL', 2, 0, 'HLA-B*08:01'), ('LPSPPAQEL', 2, 0, 'HLA-B*08:02'), 
             ('SLYLLLQHR', 1, 2, 'HLA-A*68:01')], 
        10: [('TSLYLLLQHR', 1, 3, 'HLA-A*68:01'), 
             ('TLPSPPAQEL', 2, 1, 'HLA-A*02:01'), ('TLPSPPAQEL', 2, 1, 'HLA-A*24:02'), ('TLPSPPAQEL', 2, 1, 'HLA-B*08:01'), 
             ('TLPSPPAQEL', 2, 1, 'HLA-B*08:02')]}
        '''
        n_axes = 0
        to_draw = []
        for k,v in reduced.items():
            to_draw.extend(v)
            n_axes += len(v)
        ncols = 4
        nrows = n_axes // ncols + 1 + 1
        height_ratios = [0.2 if i == 0 else (1-0.2)/(nrows-1) for i in range(nrows)]
        fig = plt.figure()
        gs = mpl.gridspec.GridSpec(nrows=nrows,ncols=ncols,height_ratios=height_ratios,wspace=0.5,hspace=0.5)
        ax_genome  = fig.add_subplot(gs[0,:])
        axes_list = []
        for i in range(1,gs.nrows):
            for j in range(0,gs.ncols):
                ax = fig.add_subplot(gs[i,j])
                axes_list.append(ax)
        ax_genome = draw_genome(ax_genome,self.uid,dict_exonCoords)
        for i,ax in enumerate(axes_list):
            if i < n_axes:
                # info
                aa, extra, n_from_first, hla, phase, evidences = to_draw[i]
                first,second = self.junction.split(',')
                dna_first = extra + n_from_first * 3
                dna_second = -extra + (len(aa)-n_from_first) * 3
                binding_score = self.enhanced_peptides[len(aa)][aa][hla]['netMHCpan_el'][0]
                immunogenicity_score = self.enhanced_peptides[len(aa)][aa][hla]['deepimmuno_immunogenicity'][0]   
                # draw     
                ax = show_candicates(ax,aa,extra,n_from_first,hla,phase,evidences,first,second,dna_first,dna_second,binding_score,immunogenicity_score) 
            else:
                ax.axis('off') 
        fig.subplots_adjust(top=0.9)             
        fig.suptitle('{} Count:{}'.format(self.uid,self.count))
        plt.savefig(os.path.join(outdir,name),bbox_inches='tight')
        plt.close()

       

# processing functions
def hla_formatting(pre,pre_type,post_type):
    if pre_type == 'netMHCpan_output' and post_type == 'netMHCpan_input':  # HLA-A*01:01 to HLA-A01:01
        post = [hla.replace('*','') for hla in pre]
    elif pre_type == 'netMHCpan_input' and post_type == 'netMHCpan_output':  # HLA-A01:01 to HLA-A*01:01
        post = [hla[:5] + '*' + hla[5:] for hla in pre]
    elif pre_type == 'netMHCpan_output' and post_type == 'deepimmuno':  # HLA-A*01:01 to HLA-A*0101
        post = [hla.replace(':','') for hla in pre]
    elif pre_type == 'deepimmuno' and post_type == 'netMHCpan_output': # HLA-A*0101 to HLA-A*01:01
        post = [hla[:8] + ':' + hla[-2:] for hla in pre]
    return post





def get_peptides(de_facto_first,second,ks,phase,evidences):
    peptides = {k:[] for k in ks}
    extra = len(de_facto_first) % 3  # how many base left in first assuming no stop condon in front of it.
    num = len(de_facto_first) // 3   # how many set of codons in the first.
    aa_first = str(Seq(de_facto_first).translate(to_stop=True))  # if not the multiple of 3, only translate the part that are divisible by 3, it will be a warning for BioPython
    if len(aa_first) == num:  # successfully read through
        if extra == 0:
            continue_second = second
        elif extra == 1:
            continue_second = de_facto_first[-1] + second
        elif extra == 2:
            continue_second = de_facto_first[-2:] + second
        aa_second = str(Seq(continue_second).translate(to_stop=True))
        if len(aa_second) > 0:  # at least, not ''
            for k in ks:
                second_most = min(k,len(aa_second)) # the max allowed number of residue for aa_second
                first_most = len(aa_first)  # the max allowed number of residue for aa_first
                for n_from_second in range(second_most,0,-1):
                    n_from_first = k - n_from_second
                    if n_from_first == 0 and extra == 0:
                        '''
                        cttca cct cac ttt acc ttc tcc tcc agc aca gga act agg aac tac gga gag aga agc caa 
                           S   P   H   F   T   F   S   S   S   T   G   T   R   N   Y   G   E   R   S   Q

                        the comma is between "ttt" and "acc"
                        so, when extra == 0, means whole second will be after the junction, in this case, we have to require 
                        the peptide at least include a amino acid from first part, otherwise, TFSSSTGTR won't be a splice-derived
                        peptide.   

                        this example can be reproduced using: nj = NeoJunction('ENSG00000223572:E15.1-E15.2')                     
                        '''
                        continue
                    if n_from_first <= first_most:
                        if n_from_first > 0:
                            pep = aa_first[-n_from_first:] + aa_second[:n_from_second]
                        elif n_from_first == 0:
                            pep = aa_second[:n_from_second]
                        peptides[k].append((pep,extra,n_from_first,phase,evidences))
    return peptides
                        


def query_from_dict_fa(abs_start,abs_end,EnsID,strand):
    '''
    abs_start and abs_end always means the xth base in forward strand

    the returned exon_seq, however, means the 5'-3' seq depending on the strand information.
    '''
    if strand == '+':        
        start = int(dict_fa[EnsID][1])
        end = int(dict_fa[EnsID][2])
        seq = dict_fa[EnsID][3]
        start_index = int(abs_start) - start + 2000
        end_index = int(abs_end) - start + 1 + 2000
        exon_seq = seq[start_index:end_index]
    
    elif strand == '-':
        start = int(dict_fa[EnsID][1])
        end = int(dict_fa[EnsID][2])
        seq_reverse = dict_fa[EnsID][3]
        seq_forward = str(Seq(seq_reverse).reverse_complement())  # Hs_gene.fa restore the reverse strand info
        start_index = int(abs_start) - start + 2000
        end_index = int(abs_end) - start + 1 + 2000 # endpoint in python is non-inclusive
        exon_seq_1 = seq_forward[start_index:end_index]
        s = Seq(exon_seq_1)
        exon_seq = str(s.reverse_complement())
    return exon_seq

            


def utrAttrs(EnsID):  # try to get U0.1's attribute, but dict_exonCoords doesn't have, so we just wanna get the first entry for its EnsGID
    exonDict = dict_exonCoords[EnsID] 
    attrs = next(iter(exonDict.values()))
    chr_,strand = attrs[0],attrs[1]
    return chr_,strand

def utrJunction(site,EnsGID,strand,chr_,flag,seq_len=100):  # U0.1_438493849, here 438493849 means the site (suffix)
    if flag == 'site1' and strand == '+':  # U0.1_438493849 - E2.2
        otherSite = int(site) - seq_len + 1   # extract UTR with length = 100
        exon_seq = retrieveSeqFromUCSCapi(chr_,int(otherSite),int(site))
    elif flag == 'site1' and strand == '-':    
        otherSite = int(site) + seq_len - 1 
        exon_seq = retrieveSeqFromUCSCapi(chr_,int(site),int(otherSite))
        exon_seq = str(Seq(exon_seq).reverse_complement())
    elif flag == 'site2' and strand == '+':  # E5.3 - U5.4_48374838
        otherSite = int(site) + seq_len -1
        exon_seq = retrieveSeqFromUCSCapi(chr_,int(site),int(otherSite))
    elif flag == 'site2' and strand == '-':
        otherSite = int(site) - seq_len + 1
        exon_seq = retrieveSeqFromUCSCapi(chr_,int(otherSite),int(site))
        exon_seq = str(Seq(exon_seq).reverse_complement())
    return exon_seq

def retrieveSeqFromUCSCapi(chr_,start,end):
    url = 'http://genome.ucsc.edu/cgi-bin/das/hg38/dna?segment={0}:{1},{2}'.format(chr_,start,end)
    response = requests.get(url)
    status_code = response.status_code
    assert status_code == 200
    try:
        my_dict = xmltodict.parse(response.content)
    except:
        exon_seq = '#' * 10  # indicating the UCSC doesn't work
        return exon_seq
    exon_seq = my_dict['DASDNA']['SEQUENCE']['DNA']['#text'].replace('\n','').upper()
    return exon_seq

def is_consecutive(subexon1,subexon2):
    # I only care of E3.3-E3.4, other form will automatically being labelled as not consecutive
    # the return code: 0 -> not consecutive, 1 -> consecutive
    code = 0
    pattern1 = re.compile(r'^E(\d{1,3})\.(\d{1,3})$')
    match1 = re.search(pattern1,subexon1)
    if match1:
        captured_group1 = match1.group(1)
        captured_group2 = match1.group(2)
        pattern2 = re.compile(rf'^E{captured_group1}\.{int(captured_group2)+1}$')
        match2 = re.search(pattern2,subexon2)
        if match2:
            code = 1
    return code

def subexon_tran(subexon,EnsID,flag,code):  # flag either site1 or site2
    '''
    1. subexon can take multiple forms depending on the event type
    E1.2 or I3.4
    E6.2_67996641 or I40.1_13076665, also depending on whether they are subexon1 or subexon2
    ENSG00000213934:E3.1 or ENSG00000213934:E2.1_473843893894
    U0.1_49689185
    2. everything with trailing suffix will depend on the subexon1 or subexon2, but sometimes, it is fixed (trans-splicing can only be in subexon2)
    3. to be clear, the exon_seq returned is always 5'-3' sequence, not forward anymore.

    if code is 1, meaning its consecutive, E3.3-E3.4, if the flag is site2, then offset the start position of it by 1, the start position is the conceptual
    start position, if it is in negative string, it will be operated by the stop position.
    '''
    # let's remove those in the first place
    # if EnsID not in set(dict_exonCoords.keys()):
    #     exon_seq = '*' * 10  # indicator for error on MultiPath-PSI itself
    #     return exon_seq
        
    try:   # E1.2 or I3.4
        attrs = dict_exonCoords[EnsID][subexon]  # [chr,strand,start,end,suffer] 
        if code == 1 and flag == 'site2':
            if attrs[1] == '+':
                exon_seq = query_from_dict_fa(int(attrs[2])+1,attrs[3],EnsID,attrs[1])   
            else:
                exon_seq = query_from_dict_fa(attrs[2],int(attrs[3])-1,EnsID,attrs[1])
        else:
            exon_seq = query_from_dict_fa(attrs[2],attrs[3],EnsID,attrs[1])
    except KeyError:
        if ':' in subexon: # ENSG00000213934:E3.1
            fusionGeneEnsID = subexon.split(':')[0] 
            fusionGeneExon = subexon.split(':')[1]        
            if  '_' in fusionGeneExon:   # ENSG:E2.1_473843893894
                suffix = fusionGeneExon.split('_')[1]
                subexon = fusionGeneExon.split('_')[0]
                attrs = dict_exonCoords[fusionGeneEnsID][subexon]
                if attrs[1] == '+':  
                    exon_seq = query_from_dict_fa(suffix,attrs[3],fusionGeneEnsID,attrs[1]) 
                else:  
                    exon_seq = query_from_dict_fa(attrs[2],suffix,fusionGeneEnsID,attrs[1])
            else:  # ENSG:E2.1
                try:
                    attrs = dict_exonCoords[fusionGeneEnsID][fusionGeneExon]
                except KeyError:
                    exon_seq = '*' * 10  # indicator for error on MultiPath-PSI itself
                else:
                    exon_seq = query_from_dict_fa(attrs[2],attrs[3],fusionGeneEnsID,attrs[1]) 

        else:  # could be trailing or utr, or non-existing ordinary subexon
            try:
                suffix = subexon.split('_')[1]
            except IndexError: # the logic is there's a subexon E45.3, it is no trailing, but just not in the exonCoords.
                exon_seq = '*' * 10  # indicator for error on MultiPath-PSI itself
            else:
                subexon = subexon.split('_')[0]
                try:
                    attrs = dict_exonCoords[EnsID][subexon]
                except KeyError:  # must be UTR
                    chrUTR,strandUTR = utrAttrs(EnsID) # this is get from a random subexon under that EnsID
                    exon_seq = utrJunction(suffix,EnsID,strandUTR,chrUTR,flag)  
                else:   # must be trailing
                    if flag == 'site2':
                        if attrs[1] == '+':  
                            exon_seq = query_from_dict_fa(suffix,attrs[3],EnsID,attrs[1]) 
                        else:  
                            exon_seq = query_from_dict_fa(attrs[2],suffix,EnsID,attrs[1])
                    elif flag == 'site1':  # not affected by overhang since it is site1
                        if attrs[1] == '+': 
                            exon_seq = query_from_dict_fa(attrs[2],suffix,EnsID,attrs[1])
                        else:
                            exon_seq = query_from_dict_fa(suffix,attrs[3],EnsID,attrs[1])
    return exon_seq


[docs]def enhance_frequency_table(df,remove_quote=True,save=True,outdir='',name=None):
    '''
    This is a wrapper function to add (1) gene symbol, (2) chromosome coordinate, (3) tumor specificity to the frequency table at once

    :param df: the dataframe for the frequency table (freq_stage{n}_verbosity1_uid.txt)
    :param remove_quote: boolean, depending on how your df is loaded, if directly read from the disk, certain column will contain quotation, whether to remove it or not
    :param save: boolean, whether to save the result or not, default is True
    :param outdir: string, the output folder.
    :param name: string, the output file name

    Example::

        enhance_frequency_table(df=df,True,True,'result','freq_stage3_verbosity1_uid_gene_symbol_coord_mean_mle.txt')

    '''
    print('adding gene symbol')
    df = add_gene_symbol_frequency_table(df=df,remove_quote=remove_quote)
    print('adding chromosome coordinates')
    df = add_coord_frequency_table(df,remove_quote=False)
    print('adding tumor specificity mean raw count')
    df = add_tumor_specificity_frequency_table(df,'mean',False)
    print('adding tumor specificity MLE score')
    df = add_tumor_specificity_frequency_table(df,'mle',False)
    if save:
        df.to_csv(os.path.join(outdir,name),sep='\t')
    return df

[docs]def add_coord_frequency_table(df,remove_quote=True):
    '''
    Convert the uid to chromsome coordinates

    :param df: the input df, index must be uid
    :param remove_quote: bool, depending on how your df is loaded, if directly read from the disk, certain column will contain quotation, whether to remove it or not
    :return df: the output df, with added coordinate column

    Exmaple::

        add_coord_frequency_table(df=df,remove_quote=True)
    '''
    # the index has to be comma separated string
    from ast import literal_eval
    if remove_quote:
        df['samples'] = [literal_eval(item) for item in df['samples']]   
    uid_list = [item.split(',')[1] for item in df.index] 
    coord_list = [uid_to_coord(uid) for uid in uid_list]
    df['coord'] = coord_list
    return df


def uid_to_coord(uid):
    tmp_list = uid.split(':')
    if len(tmp_list) == 2:
        ensg,exons = tmp_list
    elif len(tmp_list) == 3:
        ensg = tmp_list[0]
        exons = ':'.join(tmp_list[1:])
    first,second = exons.split('-')
    # figure out start_coord
    if '_' in first:
        actual_exon,trailing = first.split('_')
        try:
            attrs = dict_exonCoords[ensg][actual_exon]
        except KeyError:
            if 'U' in actual_exon:
                proxy_exon = list(dict_exonCoords[ensg].keys())[0]
                attrs = dict_exonCoords[ensg][proxy_exon]
                chrom = attrs[0]
                strand = attrs[1]
                start_coord = trailing
            else:   # probably a rare error
                chrom = 'unknown'
                strand = 'unknown'
                start_coord = 'unknown'
        else:
            chrom = attrs[0]
            strand = attrs[1]
            start_coord = trailing
    else:
        actual_exon = first
        try:
            attrs = dict_exonCoords[ensg][actual_exon]
        except KeyError:
            chrom = 'unkonwn'
            strand = 'unknown'
            start_coord = 'unknown'
        else:
            chrom = attrs[0]
            strand = attrs[1]
            if strand == '+':
                start_coord = attrs[3]  # end
            else:
                start_coord = attrs[2]  # start
    
    # figure out end_coord
    if '_' in second:
        actual_exon,trailing = second.split('_')
        try:
            attrs = dict_exonCoords[ensg][actual_exon]
        except KeyError:
            if 'U' in actual_exon:
                end_coord = trailing
            elif 'ENSG' in actual_exon:
                end_coord = trailing
            else:
                end_coord = 'unknown'      
        else:
            end_coord = trailing
    else:
        actual_exon = second
        try:
            attrs = dict_exonCoords[ensg][actual_exon]
        except KeyError:
            if 'ENSG' in actual_exon:
                ensg_second, actual_exon_second = actual_exon.split(':')
                try:
                    attrs = dict_exonCoords[ensg_second][actual_exon_second]
                except KeyError:
                    end_coord = 'unknown'
                else:
                    if strand == '+':
                        end_coord = attrs[2]  # start
                    else:
                        end_coord = attrs[3]  # end
            else:
                end_coord = 'unknown'
        else:
            if strand == '+':
                end_coord = attrs[2]  # start
            else:
                end_coord = attrs[3]  # end
    
    # assemble
    if strand == '+':
        assemble = '{}:{}-{}({})'.format(chrom,start_coord,end_coord,strand)
    else:
        assemble = '{}:{}-{}({})'.format(chrom,end_coord,start_coord,strand)

    return assemble