Source code for snaf.__init__

from .snaf import snaf_configuration,NeoJunction,JunctionCountMatrixQuery,uid_to_coord, add_coord_frequency_table, enhance_frequency_table
from .gtex import gtex_configuration, tumor_specificity, add_tumor_specificity_frequency_table, get_all_normal_h5ad
from .gtex_viewer import gtex_viewer_configuration, gtex_visual_combine, gtex_visual_subplots, gtex_visual_combine_plotly, gtex_visual_per_tissue_count
from .proteomics import *
from .downstream import *
from datetime import datetime,date
from .dash_app import run_dash_T_antigen,run_pweblogo
import os,sys

[docs]def initialize(df,db_dir,gtex_mode='count',software_path=None,binding_method=None,t_min=20,n_max=3, normal_cutoff=5, tumor_cutoff=20, normal_prevalance_cutoff=0.01, tumor_prevalance_cutoff=0.1, add_control=None): ''' Setting up global variable for running the program :param df: dataframe, the currently tested cancer cohort dataframe, this will limit number of junctions being loaded into memory for control database :param db_dir: string, the path to the database :param gtex_mode: string, either 'psi' or 'count' :param software_path: string or None, either the path to the netMHCpan4.1 executable or None (using MHCflurry) :param binding_method: string or None, either 'netMHCpan' or 'MHCflurry' :param t_min: int, the minimum number of read count the tumor sample should be larger than average number in normal database :param n_max: int, the maximum number of average read count normal database count have :param normal_cutoff: int, below which read count we consider a junction is not expressed in normal tissue, for prevalance mode :param tumor_cutoff: int, above which read count we consider a junction is expressed in tumor tissue, for prevalance mode :param normal_prevalance_cutoff: float, if below this fraction, we consider a junction is not present in normal tissue at an appreciable amount :param tumor_prevalance_cutoff: float, if above this fraction, we consider a junction is present in tumor tissue at an appreciable amount :param add_control: None or a dictionary containing additional controls, additional controls can a dataframe or anndata, for instance, if adding two controls asides from internal GTEx, {'tcga_matched_control':df,'gtex_skin':adata}, when added database contains multiple tissue types, it is recommended to pass as a AnnData with the tissue information stored as adata.var['tissue'], otherwise, if passed as a dataframe, or samples will be recognized as a single tissue type, it will affect tumor specifcity score calculation if using MLE or bayesian hierarchical models. Example:: # database directory (where you extract the reference tarball file) db_dir = '/user/ligk2e/download' # instantiate (if using netMHCpan) netMHCpan_path = '/user/ligk2e/netMHCpan-4.1/netMHCpan' snaf.initialize(db_dir=db_dir,gtex_mode='count',binding_method='netMHCpan',software_path=netMHCpan_path) # instantiate (if not using netMHCpan) snaf.initialize(db_dir=db_dir,gtex_mode='count',binding_method='MHCflurry',software_path=None) ''' print('{} {} starting initialization'.format(date.today(),datetime.now().strftime('%H:%M:%S'))) exon_table = os.path.join(db_dir,'Alt91_db','Hs_Ensembl_exon_add_col.txt') transcript_db = os.path.join(db_dir,'Alt91_db','mRNA-ExonIDs.txt') fasta = os.path.join(db_dir,'Alt91_db','Hs_gene-seq-2000_flank.fa') # 2000 only affect the query_from_dict_fa function if gtex_mode == 'count': gtex_db = os.path.join(db_dir,'controls','GTEx_junction_counts.h5ad') # gtex_db = '/data/salomonis2/LabFiles/Frank-Li/neoantigen/revision/ts/gene/gtex_count.h5ad' elif gtex_mode == 'psi': gtex_db = os.path.join(db_dir,'controls','GTEx_junction_psi.h5ad') snaf_configuration(exon_table,transcript_db,db_dir,fasta,software_path,binding_method) adata = gtex_configuration(df,gtex_db,t_min,n_max,normal_cutoff, tumor_cutoff, normal_prevalance_cutoff, tumor_prevalance_cutoff, add_control) gtex_viewer_configuration(adata) print('{} {} finishing initialization'.format(date.today(),datetime.now().strftime('%H:%M:%S'))) # create a scratch folder scratch_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),'scratch') if not os.path.exists(scratch_dir): os.mkdir(scratch_dir)
def get_reduced_junction_matrix(pc,pea,frac=None,n=None,samples=None): # preprocess the dataframe df = pd.read_csv(pc,sep='\t',index_col=0) df.index = [item.split('=')[0] for item in df.index] df = df.loc[np.logical_not(df.index.duplicated()).tolist(),:] print('original shape: {}'.format(df.shape)) if samples is not None: print('{} samples have HLA type info, will subset to these samples'.format(len(samples))) # filter to EventAnnotation file ee = [':'.join(item.split('|')[0].split(':')[1:]) for item in pd.read_csv(pea,sep='\t')['UID']] df = df.loc[df.index.isin(set(ee)),:] # for testing purpose if frac is not None: df = df.sample(frac=frac) if n is not None: df = df.sample(n=n) # if need filtering out or rearranging samples if samples is not None: df = df.loc[:,df.columns.isin(samples)] print('current shape: {}'.format(df.shape)) return df def remove_trailing_coord(count,sep='\t'): df = pd.read_csv(count,sep=sep,index_col=0) df.index = [item.split('=')[0] for item in df.index] df = df.loc[np.logical_not(df.index.duplicated()).tolist(),:] return df