Source code for camparee.allelic_imbalance_quant

import argparse
import re
import sys
import os
import collections

from pysam import AlignmentFile


OUTPUT_ALLELIC_IMBALANCE_FILE_NAME = "allelic_imbalance_quantifications.txt"

[docs]class AllelicImbalanceQuantificationStep: """ This class contains scripts to output quantification of allelic imbalance """ def __init__(self, sample_directory, sample): """ The object is constructed with (i) an input file source for gene info (ii) Root of the aligned filenames (alignment to transcriptome of each parent with suffixes '_1','_2'.) There is one output file with quantification information on the allelic imbalance of genes. The output file is called 'allelic_imbalance_quantifications.txt' :param geneinfo_filename: input information about the genes - fields are (chromosome, strand, start, end, exon count, exon starts, exon ends, gene name) :param align_filename_root: input information about the alignments to the two transcriptomes, one for each parent, created using gene_files_preparation.py """ self.sample_directory = sample_directory self.geneinfo_filename_1 = os.path.join(self.sample_directory, 'updated_annotation_1.txt') self.geneinfo_filename_2 = os.path.join(self.sample_directory, 'updated_annotation_2.txt') self.genome_alignment_file = os.path.join(self.sample_directory, "genome_alignment.Aligned.sortedByCoord.out.bam") # Check if user provided bam file. if sample.bam_file_path: self.genome_alignment_file = sample.bam_file_path self.align_filename_1 = os.path.join(self.sample_directory, '1_Aligned.out.sam') self.align_filename_2 = os.path.join(self.sample_directory, '2_Aligned.out.sam') # Create allelic imbalance distribution file and ensure that it doesn't currently exist self.allele_imbalance_dist_filename = os.path.join(sample_directory, OUTPUT_ALLELIC_IMBALANCE_FILE_NAME) try: os.remove(self.allele_imbalance_dist_filename) except OSError: pass # Dictionaries to map transcripts to genes and keep track of final count of reads mapped to a gene # This procedure does not map all gene info keys used. Consequently we need to insure that # assignments using new keys are initialized to 0 self.transcript_gene_map = collections.defaultdict(str) self.gene_final_count = collections.defaultdict(lambda: collections.defaultdict(int)) self.exclusive_genes = []
[docs] def create_transcript_gene_map(self): """ Create dictionary to map transcript id to gene id using geneinfo file Map '*' to '*' to account for unmapped reads in align_file Create entries with suffix '_1' and '_2' for each transcript """ self.transcript_gene_map['*'] = '*' with open(self.geneinfo_filename_1, 'r') as geneinfo_file: next(geneinfo_file) for line in geneinfo_file: fields = line.strip('\n').split('\t') self.transcript_gene_map[fields[7]] = fields[8]
def reads_to_ignore(self): reads_to_ignore = [] bamfile = AlignmentFile(self.genome_alignment_file, "rb") num_hits_pattern = re.compile('(NH:i:)(\d+)') for read in bamfile.fetch(until_eof=True): num_hits = dict(read.tags)['NH'] if num_hits > 1: reads_to_ignore.append(read.query_name) return reads_to_ignore
[docs] def read_info(self, in_align_filename): """ Create dictionary which maps a read id in SAM file to a dictionary with two keys 'transcript_id' and 'NM'. The value associated with 'transcript_id' is a list of all transcripts the read aligned to. The value associated with 'NM' is the corresponding edit distance information for each alignment. For non-mappers the transcript_id is '*' and edit distance is 100 (Make it read length). """ read_info_map = collections.defaultdict(dict) # The NM tag in the SAM file tells us the edit distance for the alignment. # This pattern extracts that number. num_mismatches_pattern = re.compile('(NM:i:)(\d+)') with open(in_align_filename, 'r') as infile: for line in infile: if line.startswith('@'): continue # read forward and reverse read forward = line reverse = next(infile) # Parse the fields for the forward read into an array fwd_fields = forward.rstrip('\n').split('\t') # Parse the fields for the reverse read into an array rev_fields = reverse.rstrip('\n').split('\t') fwd_transcript_id = fwd_fields[2].split(':')[0] rev_transcript_id = rev_fields[2].split(':')[0] # This means both forward and reverse reads are non-mappers # So store 'transcript_id' as '*' and 'NM' as 2*read_length if fwd_transcript_id == '*' and rev_transcript_id == '*': read_info_map[fwd_fields[0]]['transcript_id'] = '*' read_info_map[fwd_fields[0]]['NM'] = 200 continue # Get transcript_id for mapped reads elif fwd_transcript_id == rev_transcript_id: transcript_id = fwd_transcript_id else: transcript_id = (fwd_transcript_id + rev_transcript_id).replace('*','') # This probably means the transcript was not in our master list of all transcript models # (the geneinfo filename). So we skip it. Really this should not happen # but just in case. if not self.transcript_gene_map.get(transcript_id): continue # Obtain the edit distance information for the forward read fwd_NM_match = re.search(num_mismatches_pattern, forward) rev_NM_match = re.search(num_mismatches_pattern, reverse) if fwd_NM_match and rev_NM_match: fwd_NM_count = int(fwd_NM_match.group(2)) rev_NM_count = int(rev_NM_match.group(2)) NM_count = fwd_NM_count + rev_NM_count elif not (fwd_NM_match and rev_NM_match): NM_count = 200 else: NM_count = 100 # Update read_info dictionary with transcript_id and corresponding edit distance read_info_map[fwd_fields[0]]['transcript_id'] = transcript_id read_info_map[fwd_fields[0]]['NM'] = NM_count return read_info_map
[docs] def quantify_allelic_imbalance(self): """ This is the main step which quantifies allelic imbalance for all genes in the annotation based on the aligned files for parents 1 and 2. """ self.create_transcript_gene_map() # Create read info dictionaries for each parent read_info_1 = self.read_info(self.align_filename_1) read_info_2 = self.read_info(self.align_filename_2) reads_to_ignore = self.reads_to_ignore() read_ids_1 = set(read_info_1.keys()).difference(reads_to_ignore) read_ids_2 = set(read_info_2.keys()).difference(reads_to_ignore) read_ids = read_ids_1.intersection(read_ids_2) read_ids_1_u = read_ids_1.difference(read_ids) read_ids_2_u = read_ids_2.difference(read_ids) for read in read_ids_1_u: transcript = read_info_1[read]['transcript_id'] gene = self.transcript_gene_map[transcript] self.gene_final_count[gene]['1'] += 1 for read in read_ids_2_u: transcript = read_info_2[read]['transcript_id'] gene = self.transcript_gene_map[transcript] self.gene_final_count[gene]['2'] += 1 for read in read_ids: # Transcripts to which the read mapped for each parent transcript_1 = read_info_1[read]['transcript_id'] transcript_2 = read_info_2[read]['transcript_id'] # The read did not map to any transcript in either parent if transcript_1 == '*' and transcript_2 == '*': continue # The read mapped to atleast one transcript in each parent elif transcript_1 != '*' and transcript_2 != '*': # Get the genes in parent 1 to which the read mapped gene_1 = self.transcript_gene_map[transcript_1] NM_count_1 = read_info_1[read]['NM'] # Get the genes in parent 2 to which the read mapped gene_2 = self.transcript_gene_map[transcript_2] NM_count_2 = read_info_2[read]['NM'] # Amongst the genes to which the read mapped, # there is exactly one gene in common between parent 1 and 2. if gene_1 == gene_2: # Minimum edit distance for the mapping to the gene is the same in # parent 1 and parent 2. So increment counts of both alleles of the genes by 0.5 if NM_count_1 == NM_count_2: self.gene_final_count[gene_1]['1'] += 0.5 self.gene_final_count[gene_1]['2'] += 0.5 # Minimum edit distance for the mapping to the gene is less in parent 1. # So increment count of allele of gene corresponding to parent 1. elif NM_count_1 < NM_count_2: self.gene_final_count[gene_1]['1'] += 1 # Minimum edit distance for the mapping to the gene is less in parent 2. # So increment count of allele of gene corresponding to parent 2. else: self.gene_final_count[gene_1]['2'] += 1 # The read is a non-mapper for the parent 1 transcriptome elif transcript_1 == '*': # Get the genes in parent 2 to which the read mapped gene_2 = self.transcript_gene_map[transcript_2] self.gene_final_count[gene_2]['2'] += 1 # The read is a non-mapper for the parent 2 transcriptome elif transcript_2 == '*': gene_1 = self.transcript_gene_map[transcript_1] self.gene_final_count[gene_1]['1'] += 1
#self.gene_final_count = collections.OrderedDict(sorted(self.gene_final_count.items())) def make_allele_imbalance_dist_file(self): genelist_1 = [] with open(self.geneinfo_filename_1, 'r') as geneinfo_file_1: for line in geneinfo_file_1: if line.startswith('#'): continue fields = line.strip('\n').split('\t') genelist_1.append(fields[8]) genelist_2 = [] with open(self.geneinfo_filename_2, 'r') as geneinfo_file_2: for line in geneinfo_file_2: if line.startswith('#'): continue fields = line.strip('\n').split('\t') genelist_2.append(fields[8]) exclusive_genes = list(set(genelist_1).difference(set(genelist_2))) # Write the allelic imbalance quantification information to allele imbalance dist filename with open(self.allele_imbalance_dist_filename, 'w') as allele_imbalance_dist_file: allele_imbalance_dist_file.write('#gene_id' + '\t' + '_1' + '\t' + '_2' + '\n') #for key, value in list(self.gene_final_count.items()): for gene_id in sorted(set(self.transcript_gene_map.values())): if gene_id in exclusive_genes: allele_imbalance_dist_file.write(str(gene_id) + '\t' + str(1.0) + '\t' + str(0.0) + '\n') continue if gene_id == "*": continue read_count_1 = self.gene_final_count[gene_id]['1'] read_count_2 = self.gene_final_count[gene_id]['2'] gene_read_count = read_count_1 + read_count_2 if gene_read_count == 0: allele_imbalance_dist_file.write(str(gene_id) + '\t' + str(0.5) + '\t' + str(0.5) + '\n') else: allele_imbalance_dist_file.write(str(gene_id) + '\t' + str(read_count_1/gene_read_count) + '\t' +\ str(read_count_2/gene_read_count) + '\n') @staticmethod def is_output_valid(job_arguments): # TODO return True
[docs] @staticmethod def main(): """ Entry point into script. Parses the argument list to obtain all the files needed and feeds them to the class constructor. Calls the appropriate scripts thereafter. """ parser = argparse.ArgumentParser(description='Quantifier') parser.add_argument('-d', '--sample_directory') parser.add_argument("--sample", help="Serialized Sample object containing FASTQ file info.") args = parser.parse_args() sample = eval(args.sample) allelic_imbalance_quant = AllelicImbalanceQuantificationStep(args.sample_director, sample) allelic_imbalance_quant.quantify_allelic_imbalance() allelic_imbalance_quant.make_allele_imbalance_dist_file()
if __name__ == "__main__": sys.exit(AllelicImbalanceQuantificationStep.main()) # Example command # python allelic_imbalance_quant.py -d 'sampleA'