Source code for snaf.proteomics

#!/data/salomonis2/LabFiles/Frank-Li/refactor/neo_env/bin/python3.7

import numpy as np
import pandas as pd
import os
import sys
import pickle
import h5py
import matplotlib.pyplot as plt
import xmltodict
import copy
from tqdm import tqdm

from Bio.SeqIO.FastaIO import SimpleFastaParser

########################## Following is for manipulating fasta db files
[docs]def chop_normal_pep_db(fasta_path,output_path,mers,allow_duplicates): ''' chop any normal human proteome to certain mers :param fasta_path: string, the path to the human protein fasta file :param output_path: string, the path to the output fasta file :param mers: list, like [9,10] will generate 9mer and 10mer :param allow_duplicates: boolean. whether allow duplicate or not Example:: chop_normal_pep_db(fasta_path='human_uniprot_proteome.fasta',output_path='./human_uniprot_proteome_mer9_10.fasta',mers=[9,10],allow_duplicates=False) ''' # for human genome in uniprot, 9-10mer, remove duplicates will decrease from 44,741,578 to 41,638,172 if allow_duplicates: with open(fasta_path,'r') as in_handle, open(output_path,'w') as out_handle: for title,seq in tqdm(SimpleFastaParser(in_handle)): count = 0 length = len(seq) for i in range(length): for mer in mers: if i+mer <= length: out_handle.write('>{}_{}_{}\n'.format(title,mer,count)) out_handle.write('{}\n'.format(seq[i:i+mer])) count += 1 else: with open(fasta_path,'r') as in_handle, open(output_path,'w') as out_handle: existing = set() for title,seq in tqdm(SimpleFastaParser(in_handle)): count = 0 length = len(seq) for i in range(length): for mer in mers: if i+mer <= length: subseq = seq[i:i+mer] if subseq not in existing: out_handle.write('>{}_{}_{}\n'.format(title,mer,count)) out_handle.write('{}\n'.format(subseq)) existing.add(subseq) count += 1
[docs]def compare_two_fasta(fa1_path,fa2_path,outdir='.',write_comm=False,write_unique1=False,write_unique2=False,prefix=''): ''' Compare arbitracy two fasta files, report unique and common ones :param fa1_path: the first fasta file path that you want to compare with :param fa2_path: the second fasta file path that you want to compare with :param write_comm: boolean, whether write the common one between fa1 and fa2 :param write_unique1: boolean, whether write the one unique to fa1 :param write_unique2: boolean, whether write the one unique to fa2 :param prefix: string, whether to add a prefix to the output fasta file name Example:: snaf.proteomics.compare_two_fasta(fa1_path='./fasta/human_proteome_uniprot_9_10_mers_unique.fasta', fa2_path='./fasta/neoantigen_{}_unique.fasta'.format(sample),outdir='./fasta', write_unique2=True,prefix='{}_'.format(sample)) ''' seq1,seq2 = set(),set() seq1_l,seq2_l = [],[] seq1_t,seq2_t = [],[] for i,path in enumerate([fa1_path,fa2_path]): with open(path,'r') as f: for t,s in SimpleFastaParser(f): if i == 0: if s not in seq1: seq1.add(s) seq1_l.append(s) seq1_t.append(t) elif i == 1: if s not in seq2: seq2.add(s) seq2_l.append(s) seq2_t.append(t) comm = seq1.intersection(seq2) unique1 = seq1.difference(seq2) unique2 = seq2.difference(seq1) print('common:{}\nunique1:{}\nunique2:{}'.format(len(comm),len(unique1),len(unique2))) if write_comm: print('writing comm') with open(os.path.join(outdir,'{}comm.fasta'.format(prefix)),'w') as f: for item in comm: seq1_list = seq1_l item_t = seq1_t[seq1_list.index(item)] f.write('>{}\n{}\n'.format(item_t,item)) if write_unique1: print('writing unique1') with open(os.path.join(outdir,'{}unique1.fasta'.format(prefix)),'w') as f: for item in unique1: seq1_list = seq1_l item_t = seq1_t[seq1_list.index(item)] f.write('>{}\n{}\n'.format(item_t,item)) if write_unique2: print('writing unique2') with open(os.path.join(outdir,'{}unique2.fasta'.format(prefix)),'w') as f: for item in unique2: seq2_list = seq2_l item_t = seq2_t[seq2_list.index(item)] f.write('>{}\n{}\n'.format(item_t,item))
[docs]def remove_redundant(fasta_path,out_path): ''' remove redundant entries in any fasta files :param fasta_path: string, the path to the input fasta file :param out_path: string, the path to the output folder Example:: snaf.proteomics.remove_redundant('./fasta/neoantigen_{}.fasta'.format(sample),'./fasta/neoantigen_{}_unique.fasta'.format(sample)) ''' existing = set() original_count = 0 with open(fasta_path,'r') as f1, open(out_path,'w') as f2: for t,s in SimpleFastaParser(f1): original_count += 1 if s not in existing: f2.write('>{}\n{}\n'.format(t,s)) existing.add(s) print('reduce from {} down to {}'.format(original_count,len(existing)))
######################## Marks the end of fasta db manipulation ####################### following functions are all for setting mqpar.xml file for maxQuant def add_database_file(doc,dbs): for db in dbs: try: template = copy.deepcopy(doc['MaxQuantParams']['fastaFiles']['FastaFileInfo'][-1]) # non-first time except: template = copy.deepcopy(doc['MaxQuantParams']['fastaFiles']['FastaFileInfo']) # first time finally: template['fastaFilePath'] = db template['identifierParseRule'] = '>(.*)' template['descriptionParseRule'] = '>(.*)' try: doc['MaxQuantParams']['fastaFiles']['FastaFileInfo'].append(template) except: doc['MaxQuantParams']['fastaFiles']['FastaFileInfo'] = [] doc['MaxQuantParams']['fastaFiles']['FastaFileInfo'].append(template) return doc def numThreads(doc,thread): doc['MaxQuantParams']['numThreads'] = str(thread) return doc def add_input_files(doc,inputs): experiment = 1 for inp in inputs: # 'filePaths' if not type(doc['MaxQuantParams']['filePaths']['string']) == list: doc['MaxQuantParams']['filePaths']['string'] = [] doc['MaxQuantParams']['filePaths']['string'].append(inp) else: doc['MaxQuantParams']['filePaths']['string'].append(inp) # 'experiments' if not type(doc['MaxQuantParams']['experiments']['string']) == list: doc['MaxQuantParams']['experiments']['string'] = [] doc['MaxQuantParams']['experiments']['string'].append(experiment) else: doc['MaxQuantParams']['experiments']['string'].append(experiment) experiment += 1 # 'fractions' if not type(doc['MaxQuantParams']['fractions']['short']) == list: doc['MaxQuantParams']['fractions']['short'] = [] doc['MaxQuantParams']['fractions']['short'].append('32767') else: doc['MaxQuantParams']['fractions']['short'].append('32767') # 'ptms' if not type(doc['MaxQuantParams']['ptms']['boolean']) == list: doc['MaxQuantParams']['ptms']['boolean'] = [] doc['MaxQuantParams']['ptms']['boolean'].append('False') else: doc['MaxQuantParams']['ptms']['boolean'].append('False') # 'paramGroupIndices' if not type(doc['MaxQuantParams']['paramGroupIndices']['int']) == list: doc['MaxQuantParams']['paramGroupIndices']['int'] = [] doc['MaxQuantParams']['paramGroupIndices']['int'].append('0') else: doc['MaxQuantParams']['paramGroupIndices']['int'].append('0') # 'referenceChannel' if not type(doc['MaxQuantParams']['referenceChannel']['string']) == list: doc['MaxQuantParams']['referenceChannel']['string'] = [] doc['MaxQuantParams']['referenceChannel']['string'].append(None) else: doc['MaxQuantParams']['referenceChannel']['string'].append(None) return doc def change_enzymes(doc,enzymes,mode): ''' mode 0: specific mode 3: semi-specific mode 4: unspecific mode 5: no digestion ''' doc['MaxQuantParams']['parameterGroups']['parameterGroup']['enzymeMode'] = str(mode) if enzymes == None: doc['MaxQuantParams']['parameterGroups']['parameterGroup']['enzymes'] = None else: for enzyme in enzymes: if not type(doc['MaxQuantParams']['parameterGroups']['parameterGroup']['enzymes']['string']) == list: doc['MaxQuantParams']['parameterGroups']['parameterGroup']['enzymes']['string'] = [] doc['MaxQuantParams']['parameterGroups']['parameterGroup']['enzymes']['string'].append(enzyme) else: doc['MaxQuantParams']['parameterGroups']['parameterGroup']['enzymes']['string'].append(enzyme) return doc def change_fdr(doc,protein_fdr,peptide_fdr,site_fdr): doc['MaxQuantParams']['proteinFdr'] = protein_fdr doc['MaxQuantParams']['peptideFdr'] = peptide_fdr doc['MaxQuantParams']['siteFdr'] = site_fdr return doc def change_contaminants(doc,include): doc['MaxQuantParams']['includeContaminants'] = str(include) return doc def change_variable_modifications(doc,mods=['Oxidation (M)', 'Acetyl (Protein N-term)', 'Carbamidomethyl (C)']): if mods is not None: doc['MaxQuantParams']['parameterGroups']['parameterGroup']['variableModifications']['string'] = mods else: doc['MaxQuantParams']['parameterGroups']['parameterGroup']['variableModifications'] = mods return doc def change_fixed_modifications(doc,mods=None): if mods is not None: doc['MaxQuantParams']['parameterGroups']['parameterGroup']['fixedModifications']['string'] = [] for mod in mods: doc['MaxQuantParams']['parameterGroups']['parameterGroup']['fixedModifications']['string'].append(mod) return doc def change_length(doc,minPepLen,maxPeptideMass,minPeptideLengthForUnspecificSearch,maxPeptideLengthForUnspecificSearch): doc['MaxQuantParams']['minPepLen'] = minPepLen doc['MaxQuantParams']['maxPeptideMass'] = maxPeptideMass doc['MaxQuantParams']['minPeptideLengthForUnspecificSearch'] = minPeptideLengthForUnspecificSearch doc['MaxQuantParams']['maxPeptideLengthForUnspecificSearch'] = maxPeptideLengthForUnspecificSearch return doc
[docs]def set_maxquant_configuration(base,dbs,n_threads,inputs,enzymes,enzyme_mode,outdir, outname='mqpar.xml',protein_fdr=0.01,peptide_fdr=0.01,site_fdr=0.01,include_contaminants=True,var_mods=['Oxidation (M)', 'Acetyl (Protein N-term)', 'Carbamidomethyl (C)'],fix_mods=None, minPepLen=8,maxPeptideMass=4600,minPeptideLengthForUnspecificSearch=8,maxPeptideLengthForUnspecificSearch=25): ''' automatically build MaxQuant configuration file mqpar.xml, the template mqpar.xml is generated using maxquant 2.0.3.1 on a windows PC GUI, we load a single raw file, a single fasta file, reduce carbamidomethyl from fixed modification to variable modification and allow match run, otherwise, we retain all default parameters. :param base: string, path to the base mqpar.xml file, we proivde orbitrap_match_run_v2.0.3.1 and bruker_tims_match_run_v2.0.3.1 on Github :param dbs: list, each element is the path to the database fasta file :param n_threads: int, how many threads will be used for MaxQuant search :param inputs: list, each element is the path to the raw file :param enzymes: list, what enzymes will be used for search :param enzyme_mode: int, 0 -> specific, 3 -> semi-specific, 4 -> unspecific, 5 -> no digestion :param outdir: string, the output directory for the conf file :param outname: string, default is mqpar.xml :param protein_fdr: float, default is 0.01 :param peptide_fdr: float, default is 0.01 :param site_fdr: float, default is 0.01 :param include_contaminants: boolean, default is True :param var_mods: list or None, default is ['Oxidation (M)', 'Acetyl (Protein N-term)', 'Carbamidomethyl (C)'] :param fix_mods: list or None, default is [] :param minPepLen: int, default is 8 :param maxPeptideMass: int, default is 4600 :param minPeptideLengthForUnspecificSearch: int, default is 8 :param maxPeptideLengthForUnspecificSearch: int, default is 25 Example:: dbs = ['/data/salomonis2/LabFiles/Frank-Li/neoantigen/MS/schuster/RNA/snaf_analysis/fasta/SRR5933726.Aligned.sortedByCoord.out.bed_unique2.fasta'] inputs = ['/data/salomonis2/LabFiles/Frank-Li/neoantigen/MS/schuster/MS/OvCa48/OvCa48_classI_Rep#1.raw', '/data/salomonis2/LabFiles/Frank-Li/neoantigen/MS/schuster/MS/OvCa48/OvCa48_classI_Rep#2.raw', '/data/salomonis2/LabFiles/Frank-Li/neoantigen/MS/schuster/MS/OvCa48/OvCa48_classI_Rep#3.raw', '/data/salomonis2/LabFiles/Frank-Li/neoantigen/MS/schuster/MS/OvCa48/OvCa48_classI_Rep#4.raw', '/data/salomonis2/LabFiles/Frank-Li/neoantigen/MS/schuster/MS/OvCa48/OvCa48_classI_Rep#5.raw', '/data/salomonis2/LabFiles/Frank-Li/neoantigen/MS/schuster/MS/OvCa48/OvCa48_classI_Rep#6.raw'] outdir = '/data/salomonis2/LabFiles/Frank-Li/neoantigen/MS/schuster/MS/OvCa48' snaf.proteomics.set_maxquant_configuration(base='orbitrap_match_run_v2.0.3.1',dbs=dbs,n_threads=20,inputs=inputs,enzymes=None,enzyme_mode=5,protein_fdr=1,peptide_fdr=0.05,site_fdr=1,outdir=outdir) ''' with open (base,'r') as fr: doc = xmltodict.parse(fr.read()) doc = add_database_file(doc,dbs) doc = numThreads(doc,n_threads) doc = add_input_files(doc,inputs) doc = change_enzymes(doc,enzymes,enzyme_mode) doc = change_fdr(doc,protein_fdr=protein_fdr,peptide_fdr=peptide_fdr,site_fdr=site_fdr) doc = change_contaminants(doc,include_contaminants) doc = change_variable_modifications(doc,var_mods) doc = change_fixed_modifications(doc,fix_mods) doc = change_length(doc,minPepLen,maxPeptideMass,minPeptideLengthForUnspecificSearch,maxPeptideLengthForUnspecificSearch) a = xmltodict.unparse(doc,pretty=True,encoding='utf-8') a = a.replace('&gt;','>') with open(os.path.join(outdir,outname),'w') as fw: fw.write(a) ''' Download the maxQuant (working version, 1.6.14 and 2.0.3.1) zip from official website, on windowns, just lanuch exe, follow the instructions, .Net framework should be fine, if .Net core 3.1 is needed, just install it and verify it in cmd. On Linux, Using below instructions. SAMPLE=OvCa114 cd /data/salomonis2/LabFiles/Frank-Li/neoantigen/revision/ovarian/MS/${SAMPLE} module load mono/5.20.1 export PATH=$PATH:​/usr/local/mono/5.20.1/bin mono /data/salomonis2/LabFiles/Frank-Li/neoantigen/MS/MaxQuant_2.0.3.1/bin/MaxQuantCmd.exe /data/salomonis2/LabFiles/Frank-Li/neoantigen/revision/ovarian/MS/${SAMPLE}/mqpar.xml '''
####################### Here marks the end of maxQuant configuration
[docs]def summarize_ms_result(peptide,msms,freq,outdir): ''' After finishing running Maxquant, this function can help figuring out what peptide for further validation :param peptide: string, the path to MaxQuant peptide.txt file :param msms: string, the path to the MaxQuant msms.txt file :param freq: string, the path to the SNAF frequency_stage2_verbosity1_uid_gene_symbol_coord_mean_mle.txt file :param outdir: string, the path to the output folder :return df_peptide: dataframe, the valid MS-evidenced neoantigens with associated properties Examples:: df = snaf.proteomics.summarize_ms_result(peptide='MS/OvCa114/combined/txt/peptides.txt',msms='MS/OvCa114/combined/txt/msms.txt',freq='result/frequency_stage2_verbosity1_uid_gene_symbol_coord_mean_mle.txt') ''' df_peptide = pd.read_csv(peptide,sep='\t',index_col=0) df_peptide = df_peptide.loc[df_peptide['Proteins'].notna(),:].sort_values(by='PEP') # add msms df_msms = pd.read_csv(msms,sep='\t',index_col='id') dic_msms_matches = df_msms['Matches'].to_dict() dic_msms_intensities = df_msms['Intensities'].to_dict() df_peptide['best_msms_matches'] = df_peptide['Best MS/MS'].map(dic_msms_matches) df_peptide['best_msms_intensities'] = df_peptide['Best MS/MS'].map(dic_msms_intensities) # add freq df_freq = pd.read_csv(freq,sep='\t',index_col=0) df_freq['pep'] = [item.split(',')[0] for item in df_freq.index] df_freq['uid'] = [item.split(',')[1] for item in df_freq.index] from ast import literal_eval df_freq['samples'] = [literal_eval(item) for item in df_freq['samples']] dic_pep_uid = {} dic_pepuid_property = {} for row in df_freq.itertuples(): dic_pep_uid.setdefault(row.pep,[]).append(row.uid) dic_pepuid_property[row.Index] = (row.samples,row.n_sample,row.symbol,row.coord,row.tumor_specificity_mean,row.tumor_specificity_mle) df_peptide['uids'] = df_peptide.index.map(dic_pep_uid).values df_peptide['properties'] = [[dic_pepuid_property[','.join([pep,uid])] for uid in row] for row,pep in zip(df_peptide['uids'],df_peptide.index)] df_peptide.to_csv(os.path.join(outdir,'summaried_ms_results.txt'),sep='\t') return df_peptide