Source code for snaf.surface.main

import pandas as pd
import numpy as np
import os,sys
import subprocess
import multiprocessing as mp
import re
import pickle
from datetime import datetime,date
from ast import literal_eval
import requests
import xmltodict
from tqdm import tqdm
from bisect import bisect
from .data_io import *
from .orf_finder import *
from .orf_check import *
from .alignment import *
from .api import *


import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42
mpl.rcParams['font.family'] = 'Arial'


[docs]def initialize(db_dir): ''' setting up additional global variables for running B antigen program :param db_dir: string, the path to the database folder Examples:: from snaf import surface surface.initialize(db_dir=db_dir) ''' global df_exonlist global dict_exonCoords global dict_fa global dict_biotype global df_membrane_proteins global dict_uni_fa global df_topology print('{} {} starting surface antigen initialization'.format(date.today(),datetime.now().strftime('%H:%M:%S'))) transcript_db = os.path.join(db_dir,'Alt91_db','mRNA-ExonIDs.txt') exon_table = os.path.join(db_dir,'Alt91_db','Hs_Ensembl_exon_add_col.txt') fasta = os.path.join(db_dir,'Alt91_db','Hs_gene-seq-2000_flank.fa') biotype_db = os.path.join(db_dir,'Alt91_db','Hs_Ensembl_transcript-biotypes.txt') membrane_db = os.path.join(db_dir,'Alt91_db','human_membrane_proteins_acc2ens.txt') membrane_fasta_db = os.path.join(db_dir,'Alt91_db','uniprot_isoform_enhance.fasta') df_exonlist = pd.read_csv(transcript_db,sep='\t',header=None,names=['EnsGID','EnsTID','EnsPID','Exons']) # index is number dict_exonCoords = exonCoords_to_dict(exon_table) dict_fa = fasta_to_dict(fasta) dict_biotype = biotype(pd.read_csv(biotype_db,sep='\t')) # index is number df_membrane_proteins = pd.read_csv(membrane_db,sep='\t',index_col=0) dict_uni_fa = read_uniprot_seq(membrane_fasta_db) df_topology = pd.read_csv(os.path.join(db_dir,'Alt91_db','ENSP_topology.txt'),sep='\t') print('{} {} finished surface antigen initialization'.format(date.today(),datetime.now().strftime('%H:%M:%S')))
[docs]def generate_full_results(outdir,freq_path,mode,validation_gtf): ''' preferred api for wraping both generate_results and report_candiates, good for general usage :param outdir: string, the output folder :param freq_path: string, the path to the frequency_stage0_verbosity1_uid_gene_symbol_coord_mean_mle.txt file from T antigen pipeline :param mode: string, either long_read or short_read :param validation_gtf: string, the path to the validation long_read gtf when mode=short_read Examples:: # long read mode surface.generate_full_results(outdir='result/surface',freq_path='result/frequency_stage0_verbosity1_uid_gene_symbol_coord_mean_mle.txt',mode='long_read') # short read mode surface.generate_full_results(outdir='result/surface',freq_path='result/frequency_stage0_verbosity1_uid_gene_symbol_coord_mean_mle.txt',mode='short_read',validation_gtf='/data/salomonis2/LabFiles/Frank-Li/neoantigen/TCGA/SKCM/snaf_analysis/SQANTI-all/collapse_isoforms_classification.filtered_lite.gtf') ''' if mode == 'long_read': for style in [None,'deletion','insertion']: for overlap_extracellular in [True,False]: cc,cf = generate_results(pickle_path=os.path.join(outdir,'surface_antigen_lr.p'),strigency=3,outdir=outdir,long_read=True,style=style,overlap_extracellular=overlap_extracellular) if cc > 0: report_candidates(pickle_path=os.path.join(outdir,'surface_antigen_lr.p'), candidates_path=os.path.join(outdir,'candidates_3_lr_{}_{}.txt'.format(style,overlap_extracellular)), validation_path=os.path.join(outdir,'validation_3_lr_{}_{}.txt'.format(style,overlap_extracellular)), freq_df_path=freq_path, mode='long_read',outdir=os.path.join(outdir,'B_candidates'),name='lr_str3_report_{}_{}.txt'.format(style,overlap_extracellular)) elif mode == 'short_read': # first strigency 5 for style in [None,'deletion','insertion']: for overlap_extracellular in [True,False]: cc,cf = generate_results(pickle_path=os.path.join(outdir,'surface_antigen_sr.p'),strigency=5,outdir=outdir,long_read=False,style=style,overlap_extracellular=overlap_extracellular,gtf=validation_gtf) if cc > 0: report_candidates(pickle_path=os.path.join(outdir,'surface_antigen_sr.p'), candidates_path=os.path.join(outdir,'candidates_5_sr_{}_{}.txt'.format(style,overlap_extracellular)), validation_path=os.path.join(outdir,'validation_5_sr_{}_{}.txt'.format(style,overlap_extracellular)), freq_df_path=freq_path, mode='short_read',outdir=os.path.join(outdir,'B_candidates'),name='sr_str5_report_{}_{}.txt'.format(style,overlap_extracellular)) # next strigency 4 for style in [None,'deletion','insertion']: for overlap_extracellular in [True,False]: cc, cf = generate_results(pickle_path=os.path.join(outdir,'surface_antigen_sr.p'),strigency=4,outdir=outdir,long_read=False,style=style,overlap_extracellular=overlap_extracellular,gtf=validation_gtf) if cc > 0: report_candidates(pickle_path=os.path.join(outdir,'surface_antigen_sr.p'), candidates_path=os.path.join(outdir,'candidates_4_sr_{}_{}.txt'.format(style,overlap_extracellular)), validation_path=os.path.join(outdir,'validation_4_sr_{}_{}.txt'.format(style,overlap_extracellular)), freq_df_path=freq_path, mode='short_read',outdir=os.path.join(outdir,'B_candidates'),name='sr_str4_report_{}_{}.txt'.format(style,overlap_extracellular)) # then strigency 3 for style in [None,'deletion','insertion']: for overlap_extracellular in [True,False]: cc, cf = generate_results(pickle_path=os.path.join(outdir,'surface_antigen_sr.p'),strigency=3,outdir=outdir,long_read=False,style=style,overlap_extracellular=overlap_extracellular,gtf=None) if cc > 0: report_candidates(pickle_path=os.path.join(outdir,'surface_antigen_sr.p'), candidates_path=os.path.join(outdir,'candidates_3_sr_{}_{}.txt'.format(style,overlap_extracellular)), validation_path=os.path.join(outdir,'validation_3_sr_{}_{}.txt'.format(style,overlap_extracellular)), freq_df_path=freq_path, mode='short_read',outdir=os.path.join(outdir,'B_candidates'),name='sr_str3_report_{}_{}.txt'.format(style,overlap_extracellular))
def _run_dash_prioritizer_return_events(candidates): # candidates is a list of each lines, containing the newline symbol collect = [] for i,line in enumerate(candidates): if i % 14 == 0: collect.append(line.rstrip('\n')[4:]) return collect def _run_dash_prioritizer_return_valid_indices(candidates,collect_uid,uid): index = collect_uid.index(uid) line = candidates[index*14+11] valid_indices = literal_eval('[' + line.rstrip('\n').split('[')[-1]) return valid_indices def _run_dash_prioritizer_return_sa(results,gene): for sa in results: if sa.uid == gene: return sa def _run_dash_prioritizer_return_gene(candidates): collect = [] for i,line in enumerate(candidates): if i % 14 == 12: collect.append(line.rstrip('\n')[12:]) return collect def _run_dash_long_read_mode(pkl,candidates,python_executable,host=None,port='8050'): import dash from dash import dcc,html,dash_table from dash.dependencies import Input,Output,State import dash_dangerously_set_inner_html import plotly.graph_objects as go with open(pkl,'rb') as f1: results = pickle.load(f1) # a list of sa object sa, df_certain, isoforms = None, None, None # a binding for further nonlocal declaration with open(candidates,'r') as f2: candidates = f2.readlines() # a list of each lines, containing the newline symbol collect_uid = _run_dash_prioritizer_return_events(candidates) # a list of all uid collect_gene = _run_dash_prioritizer_return_gene(candidates) # a list of all gene symbol app = dash.Dash(__name__) app.layout = html.Div([ html.Div([html.H2('SNAF B-antigen Viewer'),html.Br(),html.Label('Splicing Event UID: ',style={'font-weight':'bold'}),dcc.Input(id='event_selection',value=collect_uid[0],type='text',style={'width':'40%'})],style={'text-align':'center'}), html.Div([html.Br(),html.H2(id='exon_h2'),html.Br(),html.H2('Expression (Normal --> Tumor)'),html.Br(),dcc.Graph(id='expression')],style={'text-align':'center'}), html.Div([html.H2('All Exons in AltAnalyze Gene Model'),dash_table.DataTable(id='exon',columns=[{'name':column,'id':column} for column in ['subexon','chromosome','strand','start','end']],page_size=10)]), html.Br(), html.Hr(), # start html.Div([html.H2('Novel transcript to display'),dcc.Dropdown(id='novel_transcript')],style={'text-align':'center'}), html.Div([html.H2('Existing isoform to display'),dcc.Dropdown(id='existing_isoform')],style={'text-align':'center'}), html.Div([html.Button(id='submit',n_clicks=0,children='Submit',style={'width':'10%'})],style={'text-align':'center'}), html.Div([html.H2('Novel sequence'),html.Br(),html.P(id='sequence')]), html.Div([html.H2('Exist sequence'),html.Br(),html.P(id='ensembl')]), html.Div([html.H2('Emboss Needle Alignment (peptide)'),html.Br(),html.P(id='alignment',style={'white-space':'pre','font-family':'monospace'})]), # end html.Br(), html.Hr(), html.Div([html.H2('Downstream link'), html.A(id='ensembl_link',href='http://useast.ensembl.org/Homo_sapiens/Info/Index',children='Ensembl Human'), html.Br(), html.A(id='Emboss_link',href='https://www.ebi.ac.uk/Tools/psa/emboss_needle/',children='Emboss Peptide Global Alignment'), html.Br(), html.A(id='TMHMM_link',href='https://services.healthtech.dtu.dk/service.php?TMHMM-2.0',children='TMHMM: predicting transmembrane domain'), html.Br(), html.A(id='SABLE_link',href='https://sable.cchmc.org/',children='SABLE: predicting solvebility and secondary structure'), html.Br(), html.A(id='alphafold2_link',href='https://colab.research.google.com/github/sokrypton/ColabFold/blob/main/AlphaFold2.ipynb',children='Colab version of alphafold2'), html.Br(), html.A(id='uniprot_link',href='https://www.uniprot.org/',children='Uniprot for protein'), html.Br(), html.A(id='ModFold_link',href='https://www.reading.ac.uk/bioinf/ModFOLD/',children='ModFold: 3D model assesser')]) ]) # function we need to define for running the app @app.callback(Output('expression','figure'),Output('exon_h2','children'),Output('exon','data'),Output('novel_transcript','options'),Output('existing_isoform','options'),Input('event_selection','value')) def select_event_show_table(value): nonlocal sa nonlocal df_certain nonlocal isoforms sa = _run_dash_prioritizer_return_sa(results,value) # the sa object that will be used for displaying sequence ensg = value.split(':')[0] values = dict_exonCoords[ensg] # {E1.1:[attrs]} gene_symbol = collect_gene[collect_uid.index(value)] coord = uid_to_coord(value) valid_indices = _run_dash_prioritizer_return_valid_indices(candidates,collect_uid,value) # list of valid indices # data_exon data_exon = [] for k,v in values.items(): data_exon.append({'subexon':k,'chromosome':v[0],'strand':v[1],'start':v[2],'end':v[3]}) # sort based on coordinate if data_exon[0]['strand'] == '+': data_exon.sort(reverse=False,key=lambda x:x['start']) elif data_exon[0]['strand'] == '-': data_exon.sort(reverse=True,key=lambda x:x['end']) # novel transcript section dropdown_options = [{'label':item,'value':item} for i, item in enumerate(sa.full_length_attrs) if i in valid_indices] # existing isoform section isoforms = dict_uni_fa[ensg] # {acc1:seq,acc1-2:seq} dropdown_options_ei = [{'label':item,'value':item} for item in isoforms.keys()] # exon_h2_value exon_h2_value = '{} ({}) --- Coord: {}'.format(ensg,gene_symbol,coord) # expression plot expr_tumor_dict = sa.ed # {sample:value} expr_tumor_dict = {sample + ',' + 'tumor': value for sample,value in expr_tumor_dict.items()} # {sample,tumor:value} expr_tumor_dict = {k:v for k,v in sorted(expr_tumor_dict.items(),key=lambda x:x[1])} expr_gtex_df = sa.df # index is sample name, two columns: value and tissue expr_gtex_dict = {row.Index + ',' + row.tissue: row.value for row in expr_gtex_df.itertuples()} # {sample,tissue:value} expr_gtex_dict = {k:v for k,v in sorted(expr_gtex_dict.items(),key=lambda x:x[1])} node_x = [] node_y = [] node_text = [] expr_gtex_dict.update(expr_tumor_dict) for i,(k,v) in enumerate(expr_gtex_dict.items()): node_x.append(i) node_y.append(v) node_text.append(k) node_trace = go.Scatter(x=node_x,y=node_y,mode='markers',marker={'color':'red','size':2},text=node_text,hoverinfo='text') fig = go.Figure(data=[node_trace],layout=go.Layout(showlegend=False)) fig.update_xaxes(title_text='Samples(Normal -> Tumor)') fig.update_yaxes(title_text='Raw Read Count') return fig,exon_h2_value,data_exon,dropdown_options,dropdown_options_ei @app.callback(Output('sequence','children'),Output('ensembl','children'),Output('alignment','children'),Input('submit','n_clicks'),State('novel_transcript','value'),State('existing_isoform','value'),) def select_sequence_to_display(n_clicks,value_novel,value_existing): novel_pep_seq = sa.orfp[sa.full_length_attrs.index(value_novel)] exist_pep_seq = isoforms[value_existing] emboss_alignment = run_emboss(asequence=novel_pep_seq,bsequence=exist_pep_seq,python_executable=python_executable) emboss_alignment = emboss_alignment.replace('\n','<br/>') return novel_pep_seq,exist_pep_seq,dash_dangerously_set_inner_html.DangerouslySetInnerHTML(emboss_alignment) if host is None: host = subprocess.run(['hostname'],stdout=subprocess.PIPE,universal_newlines=True).stdout.split('\n')[0] app.run_server(host=host,port=port)
[docs]def run_dash_B_antigen(pkl,prediction_mode,candidates,python_executable,host=None,port='8050'): ''' run the dash B antigen viewer :param pkl: string, the path to the surface B pipeline pickle file :param prediction_mode: string, either short_read or long_read :param candidates: string ,the path to the candidates.txt file :param python_executable: string ,the path to the python executable you are using :param host: None or string, string or None, if None, program will run hostname to automatically detect :param port: string, default is 8050 Example:: surface.run_dash_B_antigen(pkl='result/surface_antigen.p',candidates='result/candidates_5.txt',prediction_mode='long_read', python_executable='/data/salomonis2/LabFiles/Frank-Li/refactor/neo_env/bin/python3.7') ''' if prediction_mode == 'long_read': _run_dash_long_read_mode(pkl,candidates,python_executable,host=host,port=port) elif prediction_mode == 'short_read': import dash from dash import dcc,html,dash_table from dash.dependencies import Input,Output,State import dash_dangerously_set_inner_html import plotly.graph_objects as go with open(pkl,'rb') as f1: results = pickle.load(f1) # a list of sa object sa, df_certain = None, None # a binding for further nonlocal declaration with open(candidates,'r') as f2: candidates = f2.readlines() # a list of each lines, containing the newline symbol collect_uid = _run_dash_prioritizer_return_events(candidates) # a list of all uid collect_gene = _run_dash_prioritizer_return_gene(candidates) # a list of all gene symbol app = dash.Dash(__name__) app.layout = html.Div([ html.Div([html.H2('SNAF B-antigen Viewer'),html.Br(),html.Label('Splicing Event UID: ',style={'font-weight':'bold'}),dcc.Input(id='event_selection',value=collect_uid[0],type='text',style={'width':'40%'})],style={'text-align':'center'}), html.Div([html.Br(),html.H2(id='exon_h2'),html.Br(),html.H2('Expression (Normal --> Tumor)'),html.Br(),dcc.Graph(id='expression')],style={'text-align':'center'}), html.Div([html.H2('All Exons in AltAnalyze Gene Model'),dash_table.DataTable(id='exon',columns=[{'name':column,'id':column} for column in ['subexon','chromosome','strand','start','end']],page_size=10)]), html.Div([html.H2('All related existing transcripts'),dash_table.DataTable(id='transcript',columns=[{'name':column,'id':column} for column in ['index','EnsGID','EnsTID','EnsPID','Exons']])]), html.Br(), html.Hr(), html.Div([html.H2('Transcript index'),dcc.Dropdown(id='valid_indices')],style={'text-align':'center'}), html.Div([html.H2('cDNA or peptide'),dcc.RadioItems(id='display',options=[ {'label':item,'value':item} for item in ['full_length','orft','orfp','junction','score']],value='peptide')],style={'text-align':'center'}), html.Br(), html.Div([html.Button(id='submit',n_clicks=0,children='Submit',style={'width':'10%'})],style={'text-align':'center'}), html.Div([html.H2('Sequence'),html.Br(),html.P(id='sequence')]), html.Div([html.H2('Ensembl Reference'),html.Br(),html.P(id='ensembl')]), html.Div([html.H2('Emboss Needle Alignment (peptide)'),html.Br(),html.P(id='alignment',style={'white-space':'pre','font-family':'monospace'})]), html.Br(), html.Hr(), html.Div([html.H2('Downstream link'), html.A(id='ensembl_link',href='http://useast.ensembl.org/Homo_sapiens/Info/Index',children='Ensembl Human'), html.Br(), html.A(id='Emboss_link',href='https://www.ebi.ac.uk/Tools/psa/emboss_needle/',children='Emboss Peptide Global Alignment'), html.Br(), html.A(id='TMHMM_link',href='https://services.healthtech.dtu.dk/service.php?TMHMM-2.0',children='TMHMM: predicting transmembrane domain'), html.Br(), html.A(id='SABLE_link',href='https://sable.cchmc.org/',children='SABLE: predicting solvebility and secondary structure'), html.Br(), html.A(id='alphafold2_link',href='https://colab.research.google.com/github/sokrypton/ColabFold/blob/main/AlphaFold2.ipynb',children='Colab version of alphafold2'), html.Br(), html.A(id='uniprot_link',href='https://www.uniprot.org/',children='Uniprot for protein'), html.Br(), html.A(id='ModFold_link',href='https://www.reading.ac.uk/bioinf/ModFOLD/',children='ModFold: 3D model assesser')]) ]) # function we need to define for running the app @app.callback(Output('expression','figure'),Output('exon_h2','children'),Output('exon','data'),Output('transcript','data'),Output('valid_indices','options'),Input('event_selection','value')) def select_event_show_table(value): nonlocal sa nonlocal df_certain sa = _run_dash_prioritizer_return_sa(results,value) # the sa object that will be used for displaying sequence ensg = value.split(':')[0] values = dict_exonCoords[ensg] # {E1.1:[attrs]} gene_symbol = collect_gene[collect_uid.index(value)] coord = uid_to_coord(value) valid_indices = _run_dash_prioritizer_return_valid_indices(candidates,collect_uid,value) # list of valid indices # data_exon data_exon = [] for k,v in values.items(): data_exon.append({'subexon':k,'chromosome':v[0],'strand':v[1],'start':v[2],'end':v[3]}) # sort based on coordinate if data_exon[0]['strand'] == '+': data_exon.sort(reverse=False,key=lambda x:x['start']) elif data_exon[0]['strand'] == '-': data_exon.sort(reverse=True,key=lambda x:x['end']) # data transcript df_certain = df_exonlist.loc[df_exonlist['EnsGID']==ensg,:] df_certain = df_certain.iloc[valid_indices,:] df_certain.insert(loc=0,column='index',value=valid_indices) data_transcript = df_certain.to_dict(orient='records') # drop down menu dropdown_options = [{'label':item,'value':item} for item in valid_indices] # exon_h2_value exon_h2_value = '{} ({}) --- Coord: {}'.format(ensg,gene_symbol,coord) # expression plot expr_tumor_dict = sa.ed # {sample:value} expr_tumor_dict = {sample + ',' + 'tumor': value for sample,value in expr_tumor_dict.items()} # {sample,tumor:value} expr_tumor_dict = {k:v for k,v in sorted(expr_tumor_dict.items(),key=lambda x:x[1])} expr_gtex_df = sa.df # index is sample name, two columns: value and tissue expr_gtex_dict = {row.Index + ',' + row.tissue: row.value for row in expr_gtex_df.itertuples()} # {sample,tissue:value} expr_gtex_dict = {k:v for k,v in sorted(expr_gtex_dict.items(),key=lambda x:x[1])} node_x = [] node_y = [] node_text = [] expr_gtex_dict.update(expr_tumor_dict) for i,(k,v) in enumerate(expr_gtex_dict.items()): node_x.append(i) node_y.append(v) node_text.append(k) node_trace = go.Scatter(x=node_x,y=node_y,mode='markers',marker={'color':'red','size':2},text=node_text,hoverinfo='text') fig = go.Figure(data=[node_trace],layout=go.Layout(showlegend=False)) fig.update_xaxes(title_text='Samples(Normal -> Tumor)') fig.update_yaxes(title_text='Raw Read Count') return fig,exon_h2_value,data_exon,data_transcript,dropdown_options @app.callback(Output('sequence','children'),Output('ensembl','children'),Output('alignment','children'),Input('submit','n_clicks'),State('valid_indices','value'),State('display','value'),) def select_sequence_to_display(n_clicks,value_index,value_display): if value_display == 'full_length': sequence = sa.full_length[value_index] enst = df_certain.loc[df_certain['index']==value_index,:]['EnsTID'].values[0] ensembl_sequence = run_ensembl(ens=enst) emboss_alignment = 'only support peptide now' elif value_display == 'orft': sequence = sa.orft[value_index] enst = df_certain.loc[df_certain['index']==value_index,:]['EnsTID'].values[0] ensembl_sequence = run_ensembl(ens=enst) emboss_alignment = 'only support peptide now' elif value_display == 'orfp': sequence = sa.orfp[value_index] ensp = df_certain.loc[df_certain['index']==value_index,:]['EnsPID'].values[0] ensembl_sequence = run_ensembl(ens=ensp) emboss_alignment = run_emboss(asequence=sequence,bsequence=ensembl_sequence,python_executable=python_executable) emboss_alignment = emboss_alignment.replace('\n','<br/>') elif value_display == 'junction': sequence = sa.junction enst = df_certain.loc[df_certain['index']==value_index,:]['EnsTID'].values[0] ensembl_sequence = run_ensembl(ens=enst) emboss_alignment = 'only support peptide now' elif value_display == 'score': sequence = 'Mean expression across GTEx: {} Expression frequency in cancer cohort: {}'.format(sa.score,sa.freq) ensp = df_certain.loc[df_certain['index']==value_index,:]['EnsPID'].values[0] ensembl_sequence = run_ensembl(ens=ensp) emboss_alignment = 'only support peptide now' return sequence,ensembl_sequence,dash_dangerously_set_inner_html.DangerouslySetInnerHTML(emboss_alignment) if host is None: host = subprocess.run(['hostname'],stdout=subprocess.PIPE,universal_newlines=True).stdout.split('\n')[0] app.run_server(host=host,port=port)
def uid_to_coord_regular_intron_retention(uid,only_intron=True): # ENSG44444:E34.4-I34.1 tmp_list = uid.split(':') ensg, exons = tmp_list first, second = exons.split('-') # work on start_coord actual_exon = first try: attrs = dict_exonCoords[ensg][actual_exon] except KeyError: chrom = 'unkonwn' strand = 'unknown' start_coord = 'unknown' else: chrom = attrs[0] strand = attrs[1] if strand == '+': start_coord = attrs[2] # start, here is the differece, take the start of the first exon instead of the end else: start_coord = attrs[3] # end, difference, see explanation above # work on end_coord actual_exon = second try: attrs = dict_exonCoords[ensg][actual_exon] except KeyError: chrom = 'unkonwn' strand = 'unknown' start_coord = 'unknown' else: if strand == '+': end_coord = attrs[3] # end, here is the difference, see explanation above, take the end of the intron backup_coord = attrs[2] else: end_coord = attrs[2] # start, here is the difference, see explanation above backup_coord = attrs[3] # assemble if strand == '+': if only_intron: assemble = '{}:{}-{}({})'.format(chrom,backup_coord,end_coord,strand) else: assemble = '{}:{}-{}({})'.format(chrom,start_coord,end_coord,strand) else: if only_intron: assemble = '{}:{}-{}({})'.format(chrom,end_coord,backup_coord,strand) else: assemble = '{}:{}-{}({})'.format(chrom,end_coord,start_coord,strand) return assemble def uid_to_coord(uid): tmp_list = uid.split(':') if len(tmp_list) == 2: ensg,exons = tmp_list elif len(tmp_list) == 3: ensg = tmp_list[0] exons = ':'.join(tmp_list[1:]) first,second = exons.split('-') # figure out start_coord if '_' in first: actual_exon,trailing = first.split('_') try: attrs = dict_exonCoords[ensg][actual_exon] except KeyError: if 'U' in actual_exon: proxy_exon = list(dict_exonCoords[ensg].keys())[0] attrs = dict_exonCoords[ensg][proxy_exon] chrom = attrs[0] strand = attrs[1] start_coord = trailing else: # probably a rare error chrom = 'unknown' strand = 'unknown' start_coord = 'unknown' else: chrom = attrs[0] strand = attrs[1] start_coord = trailing else: actual_exon = first try: attrs = dict_exonCoords[ensg][actual_exon] except KeyError: chrom = 'unkonwn' strand = 'unknown' start_coord = 'unknown' else: chrom = attrs[0] strand = attrs[1] if strand == '+': start_coord = attrs[3] # end else: start_coord = attrs[2] # start # figure out end_coord if '_' in second: actual_exon,trailing = second.split('_') try: attrs = dict_exonCoords[ensg][actual_exon] except KeyError: if 'U' in actual_exon: end_coord = trailing elif 'ENSG' in actual_exon: end_coord = trailing else: end_coord = 'unknown' else: end_coord = trailing else: actual_exon = second try: attrs = dict_exonCoords[ensg][actual_exon] except KeyError: if 'ENSG' in actual_exon: ensg_second, actual_exon_second = actual_exon.split(':') attrs = dict_exonCoords[ensg_second][actual_exon_second] if strand == '+': end_coord = attrs[2] # start else: end_coord = attrs[3] # end else: end_coord = 'unknown' else: if strand == '+': end_coord = attrs[2] # start else: end_coord = attrs[3] # end # assemble if strand == '+': assemble = '{}:{}-{}({})'.format(chrom,start_coord,end_coord,strand) else: assemble = '{}:{}-{}({})'.format(chrom,end_coord,start_coord,strand) return assemble def split_array_to_chunks(array,cores=None): if not isinstance(array,list): raise Exception('split_array_to_chunks function works for list, not ndarray') array_index = np.arange(len(array)) if cores is None: cores = mp.cpu_count() sub_indices = np.array_split(array_index,cores) sub_arrays = [] for sub_index in sub_indices: item_in_group = [] for i in sub_index: item_in_group.append(array[i]) sub_arrays.append(item_in_group) return sub_arrays
[docs]def run(uids,outdir,prediction_mode='short_read',n_stride=2,gtf=None,tmhmm=False,software_path=None,serialize=True): ''' main function for run B antigen pipeline :param uids: list, the membrane tuples :param outdir: string, the path where all the output will go into :param prediction_mode: string, either 'short_read' or 'long_read' :param n_stride: int, how many exons define a early stop codon, for NMD check :param gtf: None or string, if prediction mode is long_read, supply the path to the gtf file :param tmhmm: bool, use tmhmm or not :param software_path: None or string, if tmhmm=True, specify the tmhmm software path :param serialize: bool, serialize to the pickle file for the result Examples:: # if using TMHMM surface.run(membrane_tuples,outdir='result',tmhmm=True,software_path='/data/salomonis2/LabFiles/Frank-Li/python3/TMHMM/tmhmm-2.0c/bin/tmhmm') # if not using TMHMM surface.run(membrane_tuples,outdir='result',tmhmm=False,software_path=None) ''' if not os.path.exists(outdir): os.mkdir(outdir) results = [] if prediction_mode == 'short_read': file_name = 'surface_antigen_sr.p' for uid,score,df,ed,freq in tqdm(uids,total=len(uids)): sa = SurfaceAntigen(uid,score,df,ed,freq,False) sa.detect_type() sa.retrieve_junction_seq() sa.recovery_full_length_protein() sa.find_orf() sa.orf_check(n_stride=n_stride) sa.align_uniprot(tmhmm=tmhmm,software_path=software_path) results.append(sa) elif prediction_mode == 'long_read': file_name = 'surface_antigen_lr.p' gtf_dict = process_est_or_long_read_with_id(gtf) for uid,score,df,ed,freq in tqdm(uids,total=len(uids)): sa = SurfaceAntigen(uid,score,df,ed,freq,False) sa.detect_type() sa.retrieve_junction_seq() sa.recovery_full_length_protein_long_read(gtf_dict) # change sa.find_orf() sa.pseudo_orf_check() # change sa.align_uniprot(tmhmm=tmhmm,software_path=software_path) results.append(sa) if serialize: with open(os.path.join(outdir,file_name),'wb') as f: pickle.dump(results,f) # remove scratch name = os.getpid() int_file_path = os.path.join(os.path.dirname(os.path.dirname(__file__)),'scratch','{}.fasta'.format(name)) os.remove(int_file_path) return results
def batch_run(uid_list,cores,n_stride,tmhmm=False,software_path=None,serialize=False,outdir='.',name=None): # currently out of active development sub_uid_lists = split_array_to_chunks(uid_list,cores=cores) pool = pool = mp.Pool(processes=cores) r = [pool.apply_async(func=single_run,args=(sub_uid_list,n_stride,tmhmm,software_path,serialize,)) for sub_uid_list in sub_uid_lists] pool.close() pool.join() results = [] for collect in r: result = collect.get() results.extend(result) if name is None: name = 'batch_run_surface_antigen.p' with open(os.path.join(outdir,name),'wb') as f: pickle.dump(results,f) def individual_check(uid,n_stride=2,tmhmm=False,software_path=None,exons=None,indices=[None],fragments=[None]): uid = uid sa = SurfaceAntigen(uid,0,1,False) get_exon_table(uid.split(':')[0]) get_all_transcripts(uid.split(':')[0]) get_existing_isoforms(uid.split(':')[0]) if exons is not None: for exon in exons: print(exon,get_exon_sequence(exon,uid.split(':')[0])) sa.detect_type() sa.retrieve_junction_seq() sa.recovery_full_length_protein() sa.find_orf() sa.orf_check(n_stride=n_stride) sa.align_uniprot(tmhmm=tmhmm,software_path=software_path) for index,fragment in zip(indices,fragments): if index is None: continue else: sa.visualize(index=index,fragment=fragment) return sa def process_est_or_long_read(gtf): gtf_dict = {} with open(gtf,'r') as f: transcript = -1 # interpret as the index of the last-processed transcript for line in f: if transcript > -1: lchrom, lsource, ltyp, lstart, lend, lscore, lstrand, lphase, lattrs = chrom, source, typ, start, end, score, strand, phase, attrs chrom, source, typ, start, end, score, strand, phase, attrs = line.rstrip('\n').split('\t') if typ == 'transcript': if transcript == -1: composition = [] if len(composition) > 1: gtf_dict.setdefault(lchrom,{'+':[],'-':[]})[lstrand].append(composition) composition = [] transcript += 1 elif typ == 'exon': composition.append((int(start),int(end))) else: continue return gtf_dict def process_est_or_long_read_with_id(gtf): gtf_dict = {} with open(gtf,'r') as f: transcript = -1 # interpret as the index of the last-processed transcript for line in f: if transcript > -1: lchrom, lsource, ltyp, lstart, lend, lscore, lstrand, lphase, lattrs = chrom, source, typ, start, end, score, strand, phase, attrs chrom, source, typ, start, end, score, strand, phase, attrs = line.rstrip('\n').split('\t') if typ == 'transcript': if transcript == -1: composition = [attrs] if len(composition) > 1: gtf_dict.setdefault(lchrom,{'+':[],'-':[]})[lstrand].append(composition) composition = [attrs] transcript += 1 elif typ == 'exon': composition.append((int(start),int(end))) else: continue return gtf_dict def is_support_by_est_or_long_read(sa,op,strict=True): coord = uid_to_coord(sa.uid) start_coord, end_coord = coord.split(':')[1].split('(')[0].split('-') start_coord, end_coord = int(start_coord), int(end_coord) ensg = sa.uid.split(':')[0] values = dict_exonCoords[ensg] first_key = list(values.keys())[0] attrs = values[first_key] chrom = attrs[0] strand = attrs[1] transcripts = gtf_dict[chrom][strand] # the first item is attrs candidate_transcripts = [] for transcript in transcripts: transcript_start = int(transcript[1][0]) transcript_end = int(transcript[-1][-1]) if transcript_start > start_coord: break elif transcript_start < start_coord and transcript_end > end_coord: candidate_transcripts.append(transcript) else: continue return_value = False return_cand = None for cand in candidate_transcripts: attrs = cand[0] sequence = '' if strand == '+': for exon in cand[1:]: sequence += query_from_dict_fa(exon[0],exon[1],ensg,strand) else: for exon in cand[::-1][:-1]: sequence += query_from_dict_fa(exon[0],exon[1],ensg,strand) candidate_orfs = transcript2orf(sequence) max_orf = prioritize_orf(candidate_orfs) max_pep = orf2pep(max_orf) # junction site present exon_sites = [] for exon in cand[1:]: exon_sites.extend([int(item) for item in exon]) try: si = exon_sites.index(start_coord) ei = exon_sites.index(end_coord) except ValueError: continue if strict: if ei == si + 1 and max_pep == op: return_value = True return_cand = attrs break else: if ei == si + 1: return_value = True return_cand = attrs break return return_value, return_cand
[docs]def report_candidates(pickle_path,candidates_path,validation_path,freq_df_path,mode,outdir='.',name=None): ''' report SNAF-B antigen candidates, a more organized and tidy file format for all the candidate. :param pickle_path: string ,the path to the saved pickle object from `run` command. :param candidates_path: string, the path to the saved candidates.txt file which we'd like to reformat :param validation_path: string, the path to the saved validation.txt file which accoompanying the candidates.txt file :param freq_df_path: string, the path to the frequency table from SNAF-T pipeline, this will provide the frequency of each neojunction in whole cohort. :param mode: string, either 'long_read' or 'short_read',default is short_read :param outdir: string, the folder for the output directory, will create one if not exist :param name: string, the name of the saved output file. Example:: surface.report_candidates('result/surface_antigen.p','result/candidates_3.txt','result/validation_3.txt', 'result/frequency_stage0_verbosity1_uid_gene_symbol_coord_mean_mle.txt','short_read', 'result','sr_str3_report.txt') ''' if not os.path.exists(outdir): os.mkdir(outdir) with open(pickle_path,'rb') as f1: results = pickle.load(f1) # a list of sa object with open(candidates_path,'r') as f2: candidates = f2.readlines() freq_df = pd.read_csv(freq_df_path,sep='\t',index_col=0) freq_df['uid'] = [item.split(',')[1] for item in freq_df.index] uid_2_ts_mean = pd.Series(index=freq_df['uid'].values,data=freq_df['tumor_specificity_mean'].values).to_dict() uid_2_ts_mle = pd.Series(index=freq_df['uid'].values,data=freq_df['tumor_specificity_mle'].values).to_dict() collect_uid = _run_dash_prioritizer_return_events(candidates) collect_gene = _run_dash_prioritizer_return_gene(candidates) candidate_count = 0 if mode == 'short_read': with open(os.path.join(outdir,name),'w') as f3, open(validation_path,'r') as f4: f3.write('Candidate_id\tNeoJunction\tmode\tevidence\tmRNA_sequence\tpeptide_sequence\tgene_symbol\tcohort_frequency\ttumor_specificity_mean\ttumor_specificity_mle\tvalidation\n') lines = f4.readlines() for uid,gene,line in tqdm(zip(collect_uid,collect_gene,lines),total=len(collect_uid)): ts_mean = uid_2_ts_mean[uid] ts_mle = uid_2_ts_mle[uid] value = uid sa = _run_dash_prioritizer_return_sa(results,value) valid_indices = _run_dash_prioritizer_return_valid_indices(candidates,collect_uid,value) ensg = value.split(':')[0] df_certain = df_exonlist.loc[df_exonlist['EnsGID']==ensg,:] df_certain = df_certain.iloc[valid_indices,:] df_certain.insert(loc=0,column='index',value=valid_indices) for value_index in valid_indices: orft_sequence = sa.orft[value_index] orfp_sequence = sa.orfp[value_index] evidence = df_certain.loc[df_certain['index']==value_index,:]['EnsTID'].values[0] candidate_count += 1 stream = 'candidate{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(candidate_count,uid,mode,evidence,orft_sequence,orfp_sequence,gene,sa.freq,ts_mean,ts_mle,line.rstrip('\n')) f3.write(stream) elif mode == 'long_read': with open(os.path.join(outdir,name),'w') as f3, open(validation_path,'r') as f4: f3.write('Candidate_id\tNeoJunction\tmode\tevidence\tmRNA_sequence\tpeptide_sequence\tgene_symbol\tcohort_frequency\ttumor_specificity_mean\ttumor_specificity_mle\tvalidation\n') lines = f4.readlines() for uid,gene,line in tqdm(zip(collect_uid,collect_gene,lines),total=len(collect_uid)): ts_mean = uid_2_ts_mean[uid] ts_mle = uid_2_ts_mle[uid] value = uid sa = _run_dash_prioritizer_return_sa(results,value) valid_indices = _run_dash_prioritizer_return_valid_indices(candidates,collect_uid,value) for value_index in valid_indices: orft_sequence = sa.orft[value_index] orfp_sequence = sa.orfp[value_index] evidence = sa.full_length_attrs[value_index] candidate_count += 1 stream = 'candidate{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(candidate_count,uid,mode,evidence,orft_sequence,orfp_sequence,gene,sa.freq,ts_mean,ts_mle,line.rstrip('\n')) f3.write(stream)
def overlap_with_extracellular(sa): uid = sa.uid ensg = uid.split(':')[0] event_type = sa.event_type is_overlap = False if event_type != 'intron_retention': junction_span = uid_to_coord(uid) else: junction_span = uid_to_coord_regular_intron_retention(uid) if 'unknown' in junction_span: # give it a chance for manual inspection, since not too many unknown return True else: junction_span = junction_span.split('(')[0].split(':')[1].split('-') ensps = ensg2ensps[ensg] for ensp in ensps: try: regions = ensp2regions[ensp] except KeyError: continue coords = [] for item in regions: coords.extend(list(item)) coords.sort() # ascending, forward string left_insert_index = bisect(coords,int(junction_span[0])) right_insert_index = bisect(coords,int(junction_span[1])) if left_insert_index % 2 == 0: # even number, not in extracellular if right_insert_index > left_insert_index: is_overlap = True else: is_overlap = True return is_overlap def send_or_not(op,ref_seq,style,overlap_extracellular,sa): # compare length if len(op) > len(ref_seq): result = 'insertion' else: result = 'deletion' # whether overlap if overlap_extracellular: is_overlap = overlap_with_extracellular(sa) # default send send = False if style is None: if overlap_extracellular: if is_overlap: send = True else: send = True elif style == result: if overlap_extracellular: if is_overlap: send = True else: send = True return send
[docs]def generate_results(pickle_path,strigency=3,outdir='.',gtf=None,long_read=False,style=None,overlap_extracellular=False): ''' Generate candidates for B antigen :param pickle_path: string, the path to the pickle result file :param strigency: int, from 1-5, how strigent you want to program to be for calling B antigen * strigency1: novel isoform needs to be absent in uniprot database * strigency2: novel isoform also needs to be a documented protein-coding gene * strigency3: novel isoform also needs to not be subjected to Nonsense Mediated Decay (NMD) * strigency4: novel isoform also needs to have long-read or EST support (as long as the novel junction present in full-length) * strigency5: novel isoform also needs to have long-read or EST support (whole ORF needs to be the same as full-length) :param outdir: string, path to the output folder :param gtf: string, if strigency>3, you need to specify the path to the long-read or EST gtf file :param long_read: boolean, whether the last run was using prediction_mode as long_read or not, default to False :param style: None or string, can either be 'deletion' or 'insertion' meaning only generate result with deleted or inserted region :param overlap_extracellular: boolean, default is True, only generate result whose junction overlap with extracellular domain Example:: # if having gtf file for long-read data surface.generate_results(pickle_path='./result/surface_antigen.p',outdir='result',strigency=5,gtf='./SQANTI-all/collapse_isoforms_classification.filtered_lite.gtf') # if not having surface.generate_results(pickle_path='./result/surface_antigen.p',outdir='result',strigency=3,gtf=None) ''' if not os.path.exists(outdir): os.mkdir(outdir) if gtf is not None: global gtf_dict gtf_dict = process_est_or_long_read_with_id(gtf) if overlap_extracellular: # have to load some database global ensg2ensps global ensp2regions ensg2ensps = pd.Series(index=df_exonlist['EnsGID'].values,data=df_exonlist['EnsPID'].values).groupby(level=0).apply(lambda x:x.tolist()).to_dict() df_topology['region'] = [(int(start),int(end)) for start,end in zip(df_topology['genomic_start'],df_topology['genomic_stop'])] ensp2regions = df_topology.groupby(by='ensembl_prot')['region'].apply(lambda x:x.tolist()).to_dict() with open(pickle_path,'rb') as f: results = pickle.load(f) count_candidates = 0 count_further = 0 candidates = [] with open(os.path.join(outdir,'further.txt'),'w') as f2: for sa in results: uid = sa.uid ensg = sa.uid.split(':')[0] ref_seq = list(dict_uni_fa[ensg].items())[0][1] valid_indices = [] if len(sa.comments) > 0: print(sa,file=f2) count_further += 1 else: sends = [] for i,(op,n,t,a) in enumerate(zip(sa.orfp,sa.nmd,sa.translatability,sa.alignment)): if strigency == 5: if n == '#' and t == '#' and a==True: value,cand_attrs = is_support_by_est_or_long_read(sa,op,strict=True) if value: send = send_or_not(op,ref_seq,style,overlap_extracellular,sa) sends.append(send) if send: valid_indices.append(i) elif strigency == 4: if n == '#' and t == '#' and a==True: value,cand_attrs = is_support_by_est_or_long_read(sa,op,strict=False) if value: send = send_or_not(op,ref_seq,style,overlap_extracellular,sa) sends.append(send) if send: valid_indices.append(i) elif strigency == 3: if n == '#' and t == '#' and a==True: send = send_or_not(op,ref_seq,style,overlap_extracellular,sa) sends.append(send) if send: valid_indices.append(i) cand_attrs = None elif strigency == 2: # out of development if t == '#' and a==True: send = compare_length(op,ref_seq,style) if send: valid_indices.append(i) cand_attrs = None elif strigency == 1: # out of development if a==True: send = compare_length(op,ref_seq,style) if send: valid_indices.append(i) cand_attrs = None if any(sends): candidates.append((sa,sa.score,sa.freq,len(valid_indices),sa.uid,valid_indices,cand_attrs)) count_candidates += 1 if count_candidates > 0: sorted_candidates = sorted(candidates,key=lambda x:(x[1],-x[2],-x[3]),reverse=False) uid_list = list(list(zip(*sorted_candidates))[4]) ensg_list = [uid.split(':')[0] for uid in uid_list] gene_symbols = ensemblgene_to_symbol(ensg_list,'human') if long_read: file_name_lr = '_lr' else: file_name_lr = '_sr' with open(os.path.join(outdir,'candidates_{}{}_{}_{}.txt'.format(strigency,file_name_lr,style,overlap_extracellular)),'w') as f1, open(os.path.join(outdir,'validation_{}{}_{}_{}.txt'.format(strigency,file_name_lr,style,overlap_extracellular)),'w') as f3: for (sa,score,freq,hit,uid,vi,ca),gene in zip(sorted_candidates,gene_symbols): print(sa,'valid_indices:{}\n'.format(vi),'gene_symbol:{}\n'.format(gene),file=f1,sep='',end='\n') print(ca,file=f3,sep='\n',end='\n') else: print('no candidates for strigency {} style {} overlap_extracellular {}'.format(strigency, style, overlap_extracellular)) return count_candidates,count_further
def ensemblgene_to_symbol(query,species): ''' Examples:: from sctriangulate.preprocessing import GeneConvert converted_list = GeneConvert.ensemblgene_to_symbol(['ENSG00000010404','ENSG00000010505'],species='human') ''' # assume query is a list, will also return a list import mygene mg = mygene.MyGeneInfo() out = mg.querymany(query,scopes='ensemblgene',fileds='symbol',species=species,returnall=True,as_dataframe=True,df_index=True) result = out['out']['symbol'].fillna('unknown_gene').tolist() try: assert len(query) == len(result) except AssertionError: # have duplicate results df = out['out'] df_unique = df.loc[~df.index.duplicated(),:] result = df_unique['symbol'].fillna('unknown_gene').tolist() return result def get_exon_table(ensgid,outdir='.'): values = dict_exonCoords[ensgid] with open(os.path.join(outdir,'{}_exon_table.txt'.format(ensgid)),'w') as f: f.write('subexon\tchrom\tstrand\tstart\tend\tsuffer\n') for k,v in values.items(): f.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format(k,v[0],v[1],v[2],v[3],v[4])) def get_all_transcripts(ensgid,outdir='.'): df_certain = df_exonlist.loc[df_exonlist['EnsGID']==ensgid,:] df_certain.to_csv(os.path.join(outdir,'{}_all_transcripts.txt'.format(ensgid)),sep='\t',index=None) def get_existing_isoforms(ensgid,outdir='.'): with open(os.path.join(outdir,'{}_existing_isoforms.fasta'.format(ensgid)),'w') as f: for k,v in dict_uni_fa[ensgid].items(): f.write('>{}\n{}\n'.format(k,v)) def get_exon_sequence(exon,ensgid): attrs = dict_exonCoords[ensgid][exon] return query_from_dict_fa(attrs[2],attrs[3],ensgid,attrs[1]) def filter_to_membrane_protein(lis): filtered_lis = [] all_membrane = set(dict_uni_fa.keys()) for uid in lis: ensgid = uid.split(':')[0] if ensgid in all_membrane: filtered_lis.append(uid) return filtered_lis class SurfaceAntigen(object): def __init__(self,uid,score,df,ed,freq,check_overlap=True): self.uid = uid self.score = score self.df = df self.ed = ed self.freq = freq self.comments = [] if check_overlap: if not self.is_membrane_protein(): raise Exception('This event will not encode a surface protein') def is_membrane_protein(self): ensgid = self.uid.split(':')[0] if ensgid in set(df_membrane_proteins['Ens'].tolist()): return True else: return False def __str__(self): print_str = 'uid:{}\n'.format(self.uid) print_str += 'scores and freqs:{}\n'.format(','.join([str(self.score),str(self.freq)])) print_str += 'comments:{}\n'.format(self.comments) try: print_event_type = self.event_type except AttributeError: print_event_type = None print_str += 'event type:{}\n'.format(print_event_type) try: print_junction = self.junction[:5] + '...' + self.junction[-5:] except AttributeError: print_junction = None print_str += 'Junction:{}\n'.format(print_junction) try: if self.full_length == ['unrecoverable']: print_full_length = (len(self.full_length),self.full_length) else: print_full_length = (len(self.full_length),[i for i,item in enumerate(self.full_length) if item != '']) except AttributeError: print_full_length = (None,None) print_str += 'Full length transcripts: length {}, indices {}\n'.format(print_full_length[0],print_full_length[1]) try: if self.orft == ['unrecoverable']: print_orft = (len(self.orft),self.orft) else: print_orft = (len(self.orft),[i for i,item in enumerate(self.orft) if item != '']) except AttributeError: print_orft = (None,None) print_str += 'ORF transcripts: length {}, indices {}\n'.format(print_orft[0],print_orft[1]) try: if self.orfp == ['unrecoverable']: print_orfp = (len(self.orfp),self.orfp) else: print_orfp = (len(self.orfp),[i for i,item in enumerate(self.orfp) if item != '']) except AttributeError: print_orfp = (None,None) print_str += 'ORF peptides: length {}, indices {}\n'.format(print_orfp[0],print_orfp[1]) try: if self.nmd == ['unrecoverable']: print_nmd = (len(self.nmd),self.nmd) else: print_nmd = (len(self.nmd),[item for item in self.nmd if item != '']) except AttributeError: print_nmd =(None,None) print_str += 'NMD check: length {}, indices {}\n'.format(print_nmd[0],print_nmd[1]) try: if self.translatability == ['unrecoverable']: print_translatability = (len(self.translatability),self.translatability) else: print_translatability = (len(self.translatability),[item for item in self.translatability if item != '']) except AttributeError: print_translatability = (None,None) print_str += 'tranlatability check: length {}, indices {}\n'.format(print_translatability[0],print_translatability[1]) try: if self.alignment == ['unrecoverable']: print_alignment = (len(self.alignment),self.alignment) else: print_alignment = (len(self.alignment),[item for item in self.alignment if item != '']) except AttributeError: print_alignment = (None,None) print_str += 'Alignment: length {}, indices {}\n'.format(print_alignment[0],print_alignment[1]) return print_str def detect_type(self): ''' Ordinary: ENSG00000107902:E10.1-E12.1 Alt3: ENSG00000110057:E5.1-E6.2_67996641 Alt5: ENSG00000100321:E7.1_39364266-E8.1 Intron Retention: ENSG00000115524:I4.1-E5.1 Novel Exon: ENSG00000008441:I40.1_13076665-E41.1 Trans-splicing: ENSG00000196565:E14.2-ENSG00000213934:E3.1 UTR Event: ENSG00000164068:U0.1_49689185-E2.1 ''' valid_pattern = re.compile(r'^ENSG\d+:.+?-.+') if re.search(valid_pattern,self.uid): # at least valid one if len(re.findall('ENSG',self.uid)) == 2: event_type = 'trans_splicing' elif 'U' in self.uid: event_type = 'utr_event' elif '_' in self.uid: subexon12 = self.uid.split(':')[1] subexon1, subexon2 = subexon12.split('-') if 'I' in subexon12: event_type = 'novel_exon' elif '_' in subexon1 and '_' in subexon2: event_type = 'alt5_alt3' elif '_' in subexon1 and '_' not in subexon2: event_type = 'alt5' elif '_' in subexon2 and '_' not in subexon1: event_type = 'alt3' else: event_type = 'invalid' elif 'I' in self.uid: event_type = 'intron_retention' elif re.search(r'^ENSG\d+:E\d+\.\d+-E\d+\.\d+$',self.uid): e = self.uid.split(':')[1] e1 = e.split('-')[0] e2 = e.split('-')[1] e1_int = int(e1.split('.')[0][1:]) e1_frac = int(e1.split('.')[1]) e2_int = int(e2.split('.')[0][1:]) e2_frac = int(e2.split('.')[1]) if e1 == e2: # E5.1-E5.1 event_type = 'invalid' else: if e1_int > e2_int or (e1_int==e2_int and e1_frac>e2_frac): # E13.1-E12.4 or E12.10-E12.9 event_type = 'invalid' else: event_type = 'ordinary' else: event_type = 'invalid' else: event_type = 'invalid' self.event_type = event_type return event_type def retrieve_junction_seq(self): if self.event_type != 'invalid': ensid = self.uid.split(':')[0] subexon1,subexon2 = ':'.join(self.uid.split(':')[1:]).split('-') seq1 = subexon_tran(subexon1,ensid,'site1') seq2 = subexon_tran(subexon2,ensid,'site2') junction = ','.join([seq1,seq2]) self.junction = junction else: self.junction = '$' * 10 # indicating invalid uid def recovery_full_length_protein(self): if '$' not in self.junction and '*' not in self.junction and '#' not in self.junction: ensgid = self.uid.split(':')[0] exons = ':'.join(self.uid.split(':')[1:]) if self.event_type == 'ordinary': full_transcript_store,comments = recover_ordinary(ensgid,exons) self.comments.extend(comments) elif self.event_type == 'alt5' or self.event_type == 'alt3' or self.event_type == 'alt3_alt5': full_transcript_store,comments = recover_alt(ensgid,exons) self.comments.extend(comments) else: # novel_exon and utr_event and intron retention and trans-splicing full_transcript_store = ['unrecoverable'] else: full_transcript_store = ['unrecoverable'] self.full_length = full_transcript_store def recovery_full_length_protein_long_read(self,gtf_dict): if '$' not in self.junction and '*' not in self.junction and '#' not in self.junction: coord = uid_to_coord(self.uid) start_coord, end_coord = coord.split(':')[1].split('(')[0].split('-') start_coord, end_coord = int(start_coord), int(end_coord) ensg = self.uid.split(':')[0] values = dict_exonCoords[ensg] first_key = list(values.keys())[0] attrs = values[first_key] chrom = attrs[0] strand = attrs[1] transcripts = gtf_dict[chrom][strand] candidate_transcripts = [] for transcript in transcripts: # here, each transcript is [attrs,(start,end),(start,end)] transcript_start = int(transcript[1][0]) transcript_end = int(transcript[-1][-1]) if transcript_start > start_coord: break elif transcript_start < start_coord and transcript_end > end_coord: candidate_transcripts.append(transcript) else: continue full_transcript_store = [] full_transcript_attrs = [] for cand in candidate_transcripts: # here, each cand is [attrs,(start,end),(start,end)] attrs = cand[0] sequence = '' if strand == '+': for exon in cand[1:]: sequence += query_from_dict_fa(exon[0],exon[1],ensg,strand) else: for exon in cand[::-1][:-1]: sequence += query_from_dict_fa(exon[0],exon[1],ensg,strand) # junction site present exon_sites = [] for exon in cand[1:]: exon_sites.extend([int(item) for item in exon]) try: si = exon_sites.index(start_coord) ei = exon_sites.index(end_coord) except ValueError: if self.event_type == 'intron_retention': # give intron retention another chance, as in normal uid_to_coord, intron retention will only be chrx:a-a+1 ir_coord = uid_to_coord_regular_intron_retention(self.uid,only_intron=True) ir_start_coord, ir_end_coord = ir_coord.split(':')[1].split('(')[0].split('-') ir_start_coord, ir_end_coord = int(ir_start_coord), int(ir_end_coord) # new way, whether a pacbio exon completely includes the intron t = len(exon_sites) for i in range(0,t,2): lr_exon_s = exon_sites[i] lr_exon_e = exon_sites[i+1] if lr_exon_s <= ir_start_coord and lr_exon_e >= ir_end_coord: full_transcript_store.append(sequence) full_transcript_attrs.append(attrs) continue else: continue if ei == si + 1: full_transcript_store.append(sequence) full_transcript_attrs.append(attrs) else: full_transcript_store = ['unrecoverable'] full_transcript_attrs = ['unrecoverable'] self.full_length = full_transcript_store self.full_length_attrs = full_transcript_attrs def find_orf(self): orft_list = [] orfp_list = [] for sequence in self.full_length: if sequence != 'unrecoverable' and sequence != '': if len(sequence) > 10000: orft_list.append('') orfp_list.append('') self.comments.append('full_transcript > 10000') else: candidate_orfs = transcript2orf(sequence) max_orf = prioritize_orf(candidate_orfs) max_pep = orf2pep(max_orf) orft_list.append(max_orf) orfp_list.append(max_pep) else: orft_list.append(sequence) orfp_list.append(sequence) self.orft = orft_list self.orfp = orfp_list def pseudo_orf_check(self): total_length = len(self.full_length) self.nmd = ['#'] * total_length self.translatability = ['#'] * total_length def orf_check(self,n_stride): set_global_env(df_exonlist,dict_exonCoords,dict_fa,dict_biotype) nmd_check_result = nmd_check(self.uid,self.full_length,self.orft,n_stride) translatability_check_result = translatability_check(self.uid,self.orft) self.nmd = nmd_check_result self.translatability = translatability_check_result def align_uniprot(self,tmhmm,software_path=None): results = alignment_to_uniprot(self.orfp,self.uid,dict_uni_fa,tmhmm,software_path) self.alignment = results if tmhmm: subprocess.run('rm -r ./TMHMM_*',shell=True) def visualize(self,index,outdir='.',name=None,fragment=None): full_length = self.full_length[index] orft = self.orft[index] ensgid = self.uid.split(':')[0] exonlist = df_exonlist.loc[df_exonlist['EnsGID']==ensgid,:]['Exons'].iloc[index].split('|') if full_length == '' or orft == '': raise Exception('please select index that are not empty based on SurfaceAntigen summary') else: import matplotlib.pyplot as plt from matplotlib.patches import Rectangle,Patch l = len(full_length) start_o = full_length.index(orft) end_o = start_o + (len(orft)-1) fig,ax = plt.subplots() ax.set_xlim(-0.05,1.05) ax.set_ylim(-0.05,1.05) # draw full_length rect_full_length = Rectangle((0,0.8),1,0.1,linewidth=0.5,facecolor='g',edgecolor='k') ax.add_patch(rect_full_length) # draw orft rect_orft = Rectangle((start_o/l,0.6),(end_o-start_o)/l,0.1,linewidth=0.5,facecolor='orange',edgecolor='k') ax.add_patch(rect_orft) # draw junction junction = ''.join([self.junction.split(',')[0][1:],self.junction.split(',')[1][:-1]]) start_j = full_length.index(junction) end_j = start_j + (len(junction)-1) rect_junction = Rectangle((start_j/l,0.4),(end_j-start_j)/l,0.1,linewidth=0.5,facecolor='r',edgecolor='k') ax.add_patch(rect_junction) # draw exonlist for i,exon in enumerate(exonlist): attrs = dict_exonCoords[ensgid][exon] seq = query_from_dict_fa(int(attrs[2])+1,int(attrs[3])-1,ensgid,attrs[1]) try: start_s = full_length.index(seq) except ValueError: # say E9.2-E13.1 is the novel splicing event, skip the E11.1 in the list, so E11.1 won't align continue end_s = start_s + (len(seq)-1) if i % 2 == 0: rect_seq = Rectangle((start_s/l,0.2),(end_s-start_s)/l,0.1,linewidth=0.1,facecolor='pink',edgecolor='k') else: rect_seq = Rectangle((start_s/l,0.2),(end_s-start_s)/l,0.1,linewidth=0.1,facecolor='b',edgecolor='k') ax.add_patch(rect_seq) ax.text(x=(start_s + end_s)/2/l,y=0.3,s=exon,rotation=90,fontsize=2,va='bottom') # draw fragment if fragment is not None: start_f = full_length.index(fragment) end_f = start_f + (len(fragment)-1) rect_fragment = Rectangle((start_f/l,0.0),(end_f-start_f)/l,0.1,linewidth=0.5,facecolor='magenta',edgecolor='k') ax.add_patch(rect_fragment) # draw legend ax.legend(handles=[Patch(color=i) for i in ['g','orange','r','magenta']],labels=['transcript','ORF','junction','fragment'], bbox_to_anchor=(1,1),loc='upper left',frameon=False) # draw title ax.set_title('{}_{}'.format(self.uid,index)) if name is None: name = '{}_{}.pdf'.format(self.uid.replace(':','_'),index) plt.savefig(os.path.join(outdir,name),bbox_inches='tight') plt.close() # standalone functions def subexon_tran(subexon,EnsID,flag): # flag either site1 or site2 ''' 1. subexon can take multiple forms depending on the event type E1.2 or I3.4 E6.2_67996641 or I40.1_13076665, also depending on whether they are subexon1 or subexon2 ENSG00000213934:E3.1 or ENSG00000213934:E2.1_473843893894 U0.1_49689185 2. everything with trailing suffix will depend on the subexon1 or subexon2, but sometimes, it is fixed (trans-splicing can only be in subexon2) 3. to be clear, the exon_seq returned is always 5'-3' sequence, not forward anymore. ''' try: # E1.2 or I3.4 attrs = dict_exonCoords[EnsID][subexon] # [chr,strand,start,end,suffer] exon_seq = query_from_dict_fa(attrs[2],attrs[3],EnsID,attrs[1]) except KeyError: if ':' in subexon: # ENSG00000213934:E3.1 fusionGeneEnsID = subexon.split(':')[0] fusionGeneExon = subexon.split(':')[1] if '_' in fusionGeneExon: # ENSG:E2.1_473843893894 suffix = fusionGeneExon.split('_')[1] subexon = fusionGeneExon.split('_')[0] attrs = dict_exonCoords[fusionGeneEnsID][subexon] if attrs[1] == '+': exon_seq = query_from_dict_fa(suffix,attrs[3],fusionGeneEnsID,attrs[1]) else: exon_seq = query_from_dict_fa(attrs[2],suffix,fusionGeneEnsID,attrs[1]) else: # ENSG:E2.1 try: attrs = dict_exonCoords[fusionGeneEnsID][fusionGeneExon] except KeyError: exon_seq = '*' * 10 # indicator for error on MultiPath-PSI itself else: exon_seq = query_from_dict_fa(attrs[2],attrs[3],fusionGeneEnsID,attrs[1]) else: # could be trailing or utr, or non-existing ordinary subexon try: suffix = subexon.split('_')[1] except IndexError: # the logic is there's a subexon E45.3, it is no trailing, but just not in the exonCoords. exon_seq = '*' * 10 # indicator for error on MultiPath-PSI itself else: subexon = subexon.split('_')[0] try: attrs = dict_exonCoords[EnsID][subexon] except KeyError: # must be UTR chrUTR,strandUTR = utrAttrs(EnsID) # this is get from a random subexon under that EnsID exon_seq = utrJunction(suffix,EnsID,strandUTR,chrUTR,flag) else: # must be trailing if flag == 'site2': if attrs[1] == '+': exon_seq = query_from_dict_fa(suffix,attrs[3],EnsID,attrs[1]) else: exon_seq = query_from_dict_fa(attrs[2],suffix,EnsID,attrs[1]) elif flag == 'site1': # not affected by overhang since it is site1 if attrs[1] == '+': exon_seq = query_from_dict_fa(attrs[2],suffix,EnsID,attrs[1]) else: exon_seq = query_from_dict_fa(suffix,attrs[3],EnsID,attrs[1]) return exon_seq def retrieveSeqFromUCSCapi(chr_,start,end): url = 'http://genome.ucsc.edu/cgi-bin/das/hg38/dna?segment={0}:{1},{2}'.format(chr_,start,end) response = requests.get(url) status_code = response.status_code assert status_code == 200 try: my_dict = xmltodict.parse(response.content) except: exon_seq = '#' * 10 # indicating the UCSC doesn't work return exon_seq exon_seq = my_dict['DASDNA']['SEQUENCE']['DNA']['#text'].replace('\n','').upper() return exon_seq def utrAttrs(EnsID): # try to get U0.1's attribute, but dict_exonCoords doesn't have, so we just wanna get the first entry for its EnsGID exonDict = dict_exonCoords[EnsID] attrs = next(iter(exonDict.values())) chr_,strand = attrs[0],attrs[1] return chr_,strand def utrJunction(site,EnsGID,strand,chr_,flag,seq_len=100): # U0.1_438493849, here 438493849 means the site (suffix) if flag == 'site1' and strand == '+': # U0.1_438493849 - E2.2 otherSite = int(site) - seq_len + 1 # extract UTR with length = 100 exon_seq = retrieveSeqFromUCSCapi(chr_,int(otherSite),int(site)) elif flag == 'site1' and strand == '-': otherSite = int(site) + seq_len - 1 exon_seq = retrieveSeqFromUCSCapi(chr_,int(site),int(otherSite)) exon_seq = str(Seq(exon_seq).reverse_complement()) elif flag == 'site2' and strand == '+': # E5.3 - U5.4_48374838 otherSite = int(site) + seq_len -1 exon_seq = retrieveSeqFromUCSCapi(chr_,int(site),int(otherSite)) elif flag == 'site2' and strand == '-': otherSite = int(site) - seq_len + 1 exon_seq = retrieveSeqFromUCSCapi(chr_,int(otherSite),int(site)) exon_seq = str(Seq(exon_seq).reverse_complement()) return exon_seq def query_from_dict_fa(abs_start,abs_end,EnsID,strand): ''' abs_start and abs_end always means the xth base in forward strand the returned exon_seq, however, means the 5'-3' seq depending on the strand information. ''' if strand == '+': start = int(dict_fa[EnsID][1]) end = int(dict_fa[EnsID][2]) seq = dict_fa[EnsID][3] start_index = int(abs_start) - start + 2000 end_index = int(abs_end) - start + 1 + 2000 exon_seq = seq[start_index:end_index] elif strand == '-': start = int(dict_fa[EnsID][1]) end = int(dict_fa[EnsID][2]) seq_reverse = dict_fa[EnsID][3] seq_forward = str(Seq(seq_reverse).reverse_complement()) # Hs_gene.fa restore the reverse strand info start_index = int(abs_start) - start + 2000 end_index = int(abs_end) - start + 1 + 2000 # endpoint in python is non-inclusive exon_seq_1 = seq_forward[start_index:end_index] s = Seq(exon_seq_1) exon_seq = str(s.reverse_complement()) return exon_seq def recover_alt(ensgid,exons): # exons is E4.1_888888-E5.6 or E4.1-E5.6_888888 or E4.1_888888-E5.6_8888888 # match E4.1 and E5.6 separately, and incorporate the trailing coordinates while pasting the sequence # if you want to debug, just print out each exon right after pointer_down comments = [] e1,e2 = exons.split('-') # for e1 if '_' not in e1: e1_exon, e1_trailing = e1,None else: e1_exon, e1_trailing = e1.split('_') # for e2 if '_' not in e2: e2_exon, e2_trailing = e2,None else: e2_exon, e2_trailing = e2.split('_') strand = dict_exonCoords[ensgid][e1_exon][1] full_transcript_store = [] # ['',full_transcript1_seq,...] df_certain = df_exonlist.loc[df_exonlist['EnsGID']==ensgid,:] pattern1_1 = re.compile(r'{}\|'.format(e1_exon)) pattern2_1 = re.compile(r'{}\|'.format(e2_exon)) pattern2_2 = re.compile(r'{}$'.format(e2_exon)) for i,item in enumerate(df_certain['Exons']): match1 = re.search(pattern1_1,item) match2 = re.search(pattern2_1,item) or re.search(pattern2_2,item) condition = match1 and match2 if condition: if match1.start() >= match2.start(): # E16.2|E16.3|E23.1|E35.10|E35.11|E36.1|E37.1|E38.1|E39.1|E40.1|E41.1|E42.2|E43.1|E43.2|E43.4|E43.5|E2.4|E5.1|E5.2|E5.3|E5.4|E5.5 full_transcript_store.append('') comments.append('wonky ordered database') else: full_transcript = '' exonlist = iter(item.split('|')) l_exon = next(exonlist,'end') l_exon_int = int(l_exon.split('.')[0][1:]) l_exon_frac = int(l_exon.split('.')[1]) pointer_up = dict_exonCoords[ensgid][l_exon][2] if strand == '+' else dict_exonCoords[ensgid][l_exon][3] while True: n_exon = next(exonlist,'end') if n_exon == 'end': pointer_down = dict_exonCoords[ensgid][l_exon][3] if strand == '+' else dict_exonCoords[ensgid][l_exon][2] frag_seq = query_from_dict_fa(pointer_up,pointer_down,ensgid,strand) if strand == '+' else query_from_dict_fa(pointer_down,pointer_up,ensgid,strand) full_transcript += frag_seq break if n_exon == e1_exon: n_exon_int = int(n_exon.split('.')[0][1:]) n_exon_frac = int(n_exon.split('.')[1]) if n_exon_int > l_exon_int or (n_exon_int == l_exon_int and n_exon_frac > l_exon_frac + 1): # solve the one before the e1 pointer_down = dict_exonCoords[ensgid][l_exon][3] if strand == '+' else dict_exonCoords[ensgid][l_exon][2] frag_seq = query_from_dict_fa(pointer_up,pointer_down,ensgid,strand) if strand == '+' else query_from_dict_fa(pointer_down,pointer_up,ensgid,strand) full_transcript += frag_seq pointer_up = dict_exonCoords[ensgid][n_exon][2] if strand == '+' else dict_exonCoords[ensgid][n_exon][3] # now solve e1 itself if e1_trailing is None: pointer_down = dict_exonCoords[ensgid][n_exon][3] if strand == '+' else dict_exonCoords[ensgid][n_exon][2] else: pointer_down = e1_trailing frag_seq = query_from_dict_fa(pointer_up,pointer_down,ensgid,strand) if strand == '+' else query_from_dict_fa(pointer_down,pointer_up,ensgid,strand) full_transcript += frag_seq else: # E3.4|E3.5|E4.1|E4.2|E5.1|E5.2|E6.1|E6.2, e1 is E5.2, e2 is E6.2 # solve the one before e1 and e1 together if e1_trailing is None: pointer_down = dict_exonCoords[ensgid][n_exon][3] if strand == '+' else dict_exonCoords[ensgid][n_exon][2] else: pointer_down = e1_trailing frag_seq = query_from_dict_fa(pointer_up,pointer_down,ensgid,strand) if strand == '+' else query_from_dict_fa(pointer_down,pointer_up,ensgid,strand) full_transcript += frag_seq while True: # skip till e2 n_exon = next(exonlist,'end') if n_exon == e2_exon: l_exon = n_exon l_exon_int = int(l_exon.split('.')[0][1:]) l_exon_frac = int(l_exon.split('.')[1]) if e2_trailing is None: pointer_up = dict_exonCoords[ensgid][l_exon][2] if strand == '+' else dict_exonCoords[ensgid][l_exon][3] else: pointer_up = e2_trailing break continue else: n_exon_int = int(n_exon.split('.')[0][1:]) n_exon_frac = int(n_exon.split('.')[1]) if n_exon_int > l_exon_int or (n_exon_int == l_exon_int and n_exon_frac > l_exon_frac + 1): pointer_down = dict_exonCoords[ensgid][l_exon][3] if strand == '+' else dict_exonCoords[ensgid][l_exon][2] frag_seq = query_from_dict_fa(pointer_up,pointer_down,ensgid,strand) if strand == '+' else query_from_dict_fa(pointer_down,pointer_up,ensgid,strand) full_transcript += frag_seq pointer_up = dict_exonCoords[ensgid][n_exon][2] if strand == '+' else dict_exonCoords[ensgid][n_exon][3] l_exon = n_exon l_exon_int = n_exon_int l_exon_frac = n_exon_frac else: l_exon = n_exon l_exon_frac = n_exon_frac full_transcript_store.append(full_transcript) else: full_transcript_store.append('') return full_transcript_store,comments def recover_ordinary(ensgid,exons,must_novel=True): # exons is E4.1-E5.6 # match E4.1 and E5.6 separately # if you want to debug, just print out each exon right after pointer_down comments = [] e1,e2 = exons.split('-') strand = dict_exonCoords[ensgid][e1][1] full_transcript_store = [] # ['',full_transcript1_seq,...] df_certain = df_exonlist.loc[df_exonlist['EnsGID']==ensgid,:] pattern1_1 = re.compile(r'{}\|'.format(e1)) pattern2_1 = re.compile(r'{}\|'.format(e2)) pattern2_2 = re.compile(r'{}$'.format(e2)) pattern3_1 = re.compile(r'{}\|{}\|'.format(e1,e2)) pattern3_2 = re.compile(r'{}\|{}$'.format(e1,e2)) for i,item in enumerate(df_certain['Exons']): match1 = re.search(pattern1_1,item) match2 = re.search(pattern2_1,item) or re.search(pattern2_2,item) match3 = re.search(pattern3_1,item) or re.search(pattern3_2,item) if must_novel: condition = (match1 and match2) and (not match3) else: condition = match1 and match2 if condition: if match1.start() >= match2.start(): # E16.2|E16.3|E23.1|E35.10|E35.11|E36.1|E37.1|E38.1|E39.1|E40.1|E41.1|E42.2|E43.1|E43.2|E43.4|E43.5|E2.4|E5.1|E5.2|E5.3|E5.4|E5.5 full_transcript_store.append('') comments.append('wonky ordered database') else: full_transcript = '' exonlist = iter(item.split('|')) l_exon = next(exonlist,'end') l_exon_int = int(l_exon.split('.')[0][1:]) l_exon_frac = int(l_exon.split('.')[1]) pointer_up = dict_exonCoords[ensgid][l_exon][2] if strand == '+' else dict_exonCoords[ensgid][l_exon][3] while True: n_exon = next(exonlist,'end') if n_exon == 'end': pointer_down = dict_exonCoords[ensgid][l_exon][3] if strand == '+' else dict_exonCoords[ensgid][l_exon][2] frag_seq = query_from_dict_fa(pointer_up,pointer_down,ensgid,strand) if strand == '+' else query_from_dict_fa(pointer_down,pointer_up,ensgid,strand) full_transcript += frag_seq break if n_exon == e1: n_exon_int = int(n_exon.split('.')[0][1:]) n_exon_frac = int(n_exon.split('.')[1]) if n_exon_int > l_exon_int or (n_exon_int == l_exon_int and n_exon_frac > l_exon_frac + 1): # solve the one before the e1 pointer_down = dict_exonCoords[ensgid][l_exon][3] if strand == '+' else dict_exonCoords[ensgid][l_exon][2] frag_seq = query_from_dict_fa(pointer_up,pointer_down,ensgid,strand) if strand == '+' else query_from_dict_fa(pointer_down,pointer_up,ensgid,strand) full_transcript += frag_seq pointer_up = dict_exonCoords[ensgid][n_exon][2] if strand == '+' else dict_exonCoords[ensgid][n_exon][3] # now solve e1 itself pointer_down = dict_exonCoords[ensgid][n_exon][3] if strand == '+' else dict_exonCoords[ensgid][n_exon][2] frag_seq = query_from_dict_fa(pointer_up,pointer_down,ensgid,strand) if strand == '+' else query_from_dict_fa(pointer_down,pointer_up,ensgid,strand) full_transcript += frag_seq else: # E3.4|E3.5|E4.1|E4.2|E5.1|E5.2|E6.1|E6.2, e1 is E5.2, e2 is E6.2 # solve the one before e1 and e1 together pointer_down = dict_exonCoords[ensgid][n_exon][3] if strand == '+' else dict_exonCoords[ensgid][n_exon][2] frag_seq = query_from_dict_fa(pointer_up,pointer_down,ensgid,strand) if strand == '+' else query_from_dict_fa(pointer_down,pointer_up,ensgid,strand) full_transcript += frag_seq while True: # skip till e2 n_exon = next(exonlist,'end') if n_exon == e2: l_exon = n_exon l_exon_int = int(l_exon.split('.')[0][1:]) l_exon_frac = int(l_exon.split('.')[1]) pointer_up = dict_exonCoords[ensgid][l_exon][2] if strand == '+' else dict_exonCoords[ensgid][l_exon][3] break continue else: n_exon_int = int(n_exon.split('.')[0][1:]) n_exon_frac = int(n_exon.split('.')[1]) if n_exon_int > l_exon_int or (n_exon_int == l_exon_int and n_exon_frac > l_exon_frac + 1): pointer_down = dict_exonCoords[ensgid][l_exon][3] if strand == '+' else dict_exonCoords[ensgid][l_exon][2] frag_seq = query_from_dict_fa(pointer_up,pointer_down,ensgid,strand) if strand == '+' else query_from_dict_fa(pointer_down,pointer_up,ensgid,strand) full_transcript += frag_seq pointer_up = dict_exonCoords[ensgid][n_exon][2] if strand == '+' else dict_exonCoords[ensgid][n_exon][3] l_exon = n_exon l_exon_int = n_exon_int l_exon_frac = n_exon_frac else: l_exon = n_exon l_exon_frac = n_exon_frac full_transcript_store.append(full_transcript) else: full_transcript_store.append('') return full_transcript_store,comments