Source code for camparee.transcriptome_fasta_preparation

import argparse
import re
import sys
import os

from camparee.abstract_camparee_step import AbstractCampareeStep
from camparee.camparee_constants import CAMPAREE_CONSTANTS

[docs]class TranscriptomeFastaPreparationStep(AbstractCampareeStep): """Produces a transcriptome FASTA file, given a genome FASTA file, a file containing exon locations, and an annotation file. Additionally, any line in the annotation file related to a chromosome not available in the genome fasta file is discarded in a new, trimmed version of the annotation file. The object is constructed with 2 input file sources (genome fasta, annotation) and 2 output file sources (trimmed annotation, transcriptome fasta). Additionally another output file, named like the genome fasta file but suffixed with '_edited' contains a munged version of the genome fasta file where each chromosome sequence occupies one line. """ Name = "Transcriptome FASTA Preparation Step" def __init__(self, log_directory_path, data_directory_path, parameters = dict()): """Constructor for TranscriptomeFastaPreparationStep object. Parameters ---------- data_directory_path: string Full path to data directory log_directory_path : string Full path to log directory. parameters : dict Dictionary of other parameters specified by the config file. This parameter is not used by this class and is retained for uniformity with all other CAMPAREE steps. """ self.data_directory_path = data_directory_path self.log_directory_path = log_directory_path # TODO: Could probably change to a class variable, or a constant contained in another class. # Provides the regex pattern for extracting components of the exon location string self.exon_info_pattern = re.compile(r'(.*):(\d+)-(\d+)')
[docs] def validate(self): return True
[docs] def execute(self, sample_id, genome_suffix, genome_fasta_file_path, annotation_file_path, include_suffix_w_tx_id=False): """Main work-horse function that does the work of creating a transcriptome fasta file from the provided inputs. Parameters ---------- sample_id : string Identifier for sample corresponding to this reference genome. Used to construct output and log paths for this specific execution. genome_suffix : string Suffix to identify the parent/allele of the source genome. Should be 1 or 2. This same suffix is a appended to all output files, and the individual transcript IDs in the transcriptome FASTA if the include_suffix_w_tx_id parameter is set to TRUE. genome_fasta_file_path : string Input genome fasta filename containing all the chromosomes of interest. No line breaks are allowed within the chromosome sequence. This is generally prepared by the GenomeBuilderStep, in which case it should have no line breaks within the chromosome sequence. annotation_file_path : string Input transcript annotation file - fields are (chromosome, strand, start, end, exon count, exon starts, exon ends, transcript ID, etc.). This is generally prepared by the UpdateAnnotationForGenomeStep. include_suffix_w_tx_id : boolean Append parent/allele suffix to transcript names in FASTA headers of the output file when set to True [Default: False]. """ self.sample_id = sample_id self.genome_suffix = genome_suffix self.genome_fasta_file_path = genome_fasta_file_path self.edited_genome_fasta_file_path = os.path.splitext(genome_fasta_file_path)[0] + "_edited.fa" self.annotation_file_path = annotation_file_path self.trimmed_annotation_file_path = os.path.splitext(annotation_file_path)[0] + "_trimmed.txt" self.include_suffix_w_tx_id = include_suffix_w_tx_id self.transcriptome_fasta_file_path = os.path.join(self.data_directory_path, f'sample{self.sample_id}', CAMPAREE_CONSTANTS.TRANSCRIPTOME_FASTA_OUTPUT_FILENAME_PATTERN.format(genome_name=genome_suffix)) self.log_file_path = os.path.join(self.log_directory_path, f'sample{self.sample_id}', CAMPAREE_CONSTANTS.TRANSCRIPTOME_FASTA_LOG_FILENAME_PATTERN.format(genome_name=genome_suffix)) # Holds unique listing of exon locations self.exon_location_list = set() # Dictionaries to record whether a chromosome is available in the genome file, is available in the exon # file. self.chromosome_in_genome_file = dict() self.chromosome_in_exon_file = dict() # Since the genes fasta file will be open for appending, we need to insure that the file doesn't # currently exist. try: os.remove(self.transcriptome_fasta_file_path) except OSError: pass with open(self.log_file_path, 'w') as log_file: # THIS IS NOT CURRENTLY NECESSARY, AS THE PRECEDING STEPS CREATE A GENOME # FASTA IN THE APPROPRIATE FORMAT. ALTERNATIVELY, A FUNCTION THAT PERFORMS # THESE OPERATIONS IS AVAILABLE IN CampareeUtils.edit_reference_genome(). # THIS ALSO HAS THE ADDED PROBLEM OF CREATING COPIES OF BOTH PARENTAL # REFERENCE GENOMES. # NOTE: THE FUNCTION HERE PROCESSES THE FASTA FILE AS IT'S READ AND DOES # NOT NEED TO STORE THE ENTIRE FILE IN MEMORY. WE SHOULD UPDATE THE # FUNCTION IN CampareeUtils, TO OPERATE LIKE THIS AS WELL. # Munge the original genome fasta file to create an edited version in which the chromosome sequence has # no internal line breaks and where all bases are in upper case. self.scrub_genome_fasta_file() log_file.write(f"done scrubbing genome fasta file\n") # Create a unique listing of exon locations from the annotation file data. self.create_exon_location_list() log_file.write(f"done identifying unique set of exon locations\n") log_file.write(f"Assembling sequences for each transcript by chromosome\n") # Open the genome fasta file for reading only. with open(self.edited_genome_fasta_file_path, 'r') as genome_fasta_file: # Iterate over each chromosome in the genome fasta file. Note that # the chromosome sequence is expected on only one line at this stage. for line in genome_fasta_file: # Remove the leading '>' character to leave the chromosome chromosome = line.lstrip('>').rstrip('\n') # Note that the chromosome 'chromosome' is among those listed in # the genome file self.chromosome_in_genome_file[chromosome] = True # Collect the chromosome sequence from the following line sequence = genome_fasta_file.readline().rstrip('\n') # Use the chromosome and its sequence to construct a mapping of # exon location string to their sequences. exon_sequence_map = self.create_exon_sequence_map(chromosome, sequence) log_file.write(f"\tdone with exons for {chromosome}\n") # Generate the transcript fasta file using the genome chromosome # and the related exon sequence map. self.make_tx_fasta_file(chromosome, exon_sequence_map) log_file.write(f"\tdone with transcripts for {chromosome}\n") # Finally create a trimmed annotation file with any transcripts unrelated # to the given genome chromosomes provided, discarded. # This method saves messages to the log file internally, so it's outside # of the with block above to avoid opening the log file from multiple # sources. self.trim_annotation_file() #Final entry in log file to indicate execution() method finished. with open(self.log_file_path, 'a') as log_file: log_file.write('\nALL DONE!\n')
[docs] def scrub_genome_fasta_file(self): """ Edits the genome fasta file, creating an edited version (genome fasta filename without extension + _edited.fa). Edits include: 1. Removing suplemmental information from the description line 2. Removing internal newlines in the sequence 3. Insuring all bases in sequence are represented in upper case. This edited file is the one used in subsequent scripts. """ # A flag to denote when a fasta sequence is being processed in_sequence = False # Open the original genome fasta file for reading only and a new edited version of the genome fasta # file for writing only. with open(self.genome_fasta_file_path, 'r') as genome_fasta_file, \ open(self.edited_genome_fasta_file_path, 'w') as edited_genome_fasta_file: # Iterate over the lines in the original genome fasta file for line in genome_fasta_file: # Identify whether the current line is a description line or a sequence line if line.startswith('>'): # For a description line, remove any supplemental information following the identifier and # if the in_sequence flag is raised, lower it and add a line break to the new genome fasta # file before adding the modified description line. identifier_only = re.sub(r'[ \t].*', '', line) if in_sequence: edited_genome_fasta_file.write("\n") in_sequence = False edited_genome_fasta_file.write(identifier_only) # Otherwise, add the sequence to the new genome fasta file after removing the line break and # insuring all bases are in upper case. Also raise the in sequence flag. else: edited_genome_fasta_file.write(line.rstrip('\n').upper()) in_sequence = True # Finally add a line break to the end of the new genome fasta file. edited_genome_fasta_file.write("\n")
[docs] def create_exon_location_list(self): """ Generate a unique listing of exon location strings from the provided annotation file. Note that the same exon may appear in multiple transcripts. So the listing is actually a set to avoid duplicate entries. """ # Open the annotation file for reading only with open(self.annotation_file_path, 'r') as annotation_file: # Iterate over each line in the file for line in annotation_file: if line.startswith("#"): #Comment line continue # Collect all the field values for the line read following newline removal. (chromosome, strand, start, end, exon_count, exon_starts, exon_ends, name, *other) = \ line.rstrip('\n').split('\t') # Remove any trailing commas in the exon starts and exon ends fields # and split the starts and stops into their corresponding lists exon_starts_list = re.sub(r'\s*,\s*$', '', exon_starts).split(",") exon_ends_list = re.sub(r'\s*,\s*$', '', exon_ends).split(",") # For each exon belonging to the transcript, construct the exon's # location string and add it to the growing set of exon locations # assuming it is not already present. for index in range(int(exon_count)): exon_location = f'{chromosome}:{(int(exon_starts_list[index]) + 1)}-{(exon_ends_list[index])}' self.exon_location_list.add(exon_location)
[docs] def trim_annotation_file(self): """ Create a trimmed annotation file in which lines related to chromosomes that are not present in the genome fasta file are expunged. If there are no such omissions, the files will be identical. """ # TODO: rather than always create this file, it might be better if it's # only created when missing_genome_chromosomes is non-empty. Then # any future steps that come looking for this file will check to # see if the edited one exists, and load the original annotation if # it does not. # Holds a list of chromosomes found for exons in the annotation file that are not available in the # genome fasta file missing_genome_chromosomes = [] with open(self.log_file_path, 'a') as log_file: log_file.write(f"Identifying chromosomes in the annotation with no corresponding genome sequence.\n") # Iterate over all the chromosomes found for exons in the annotation file for chromosome in self.chromosome_in_exon_file.keys(): # If that chromosome is not also in the genome fasta file, issue a warning and add the # unavailable chromosome to the list of missing chromosomes. Otherwise, note the # chromosome as available. if chromosome not in self.chromosome_in_genome_file: log_file.write(f"\tno genome sequence for {chromosome}\n") missing_genome_chromosomes.append(chromosome) else: log_file.write(f"\tsequence available for {chromosome}\n") # If there are missing chromosomes, note that fact. if missing_genome_chromosomes: log_file.write(f"Removing the transcripts on chromosomes for which no genome sequences are available.\n") # Open the original annotation for reading and the edited annotation file for writing. with open(self.annotation_file_path, 'r') as annotation_in, \ open(self.trimmed_annotation_file_path, 'w') as annotation_out: # Iterate over the lines in the original annotation file for line in annotation_in: # If none of the missing chromosomes are found on the line, write # the line out to the trimmed version of the annotation file. if not any(chromosome in line for chromosome in missing_genome_chromosomes): annotation_out.write(line)
[docs] def create_exon_sequence_map(self, genome_chromosome, sequence): """ For the given genome chromosome and its sequence, create a dictionary of exon sequences keyed to the exon's location (i.e., chr:start-end). :param genome_chromosome: given genome chromosome :param sequence: the genome sequence corresponding to the genome chromosome (without line breaks) :return: map of exon location : exon sequence """ # Start with a empty dictionary exon_sequence_map = dict() # Iterate over the list of previously obtained exon locations for exon_location in self.exon_location_list: # Extract the chromosome, start and end from the exon location string exon_info_match = re.search(self.exon_info_pattern, exon_location) chromosome = exon_info_match.group(1) exon_start = int(exon_info_match.group(2)) exon_end = int(exon_info_match.group(3)) # Note that the exon listing contains at least one exon on chromosome 'chromosome' self.chromosome_in_exon_file[chromosome] = True # If the chromosome on which the exon is located is the same of the genome chromosome # provided as a parameter, get the sequence for that exon and create a dictionary # entry relating the exon location string to the exon sequence. if chromosome == genome_chromosome: exon_sequence_map[exon_location] = sequence[exon_start-1:exon_end] # Return the dictionary of exon location string : exon sequence for the genome chromosome # provided in the parameter list. return exon_sequence_map
def make_tx_fasta_file(self, genome_chromosome, exon_sequence_map): # Open the original annotation file for reading and the genes fasta file for appending. with open(self.annotation_file_path, 'r') as annotation_file, \ open(self.transcriptome_fasta_file_path, 'a') as transcriptome_fasta_file: # Iterate over the annotation file for line in annotation_file: if line.startswith("#"): # Comment line continue # Collect all the field values for the line read following newline removal. (chromosome, strand, start, end, exon_count, exon_starts, exon_ends, feature_id, *other) =\ line.rstrip('\n').split('\t') # Remove trailing commas in the exon starts and exon ends fields and split # the starts and stops into their corresponding lists exon_starts_list = re.sub(r'\s*,\s*$', '', exon_starts).split(",") exon_ends_list = re.sub(r'\s*,\s*$', '', exon_ends).split(",") # If the chromosome on which this transcript is located is the # same of the genome chromosome provided as a parameter, render # this transcript as an entry in the genes fasta file if chromosome == genome_chromosome: # Initialize the transcripts's sequence tx_sequence = "" # For each exon belonging to the transcript, construct the # exon's location string and use it as a key to obtain the # actual exon sequence. Concatenate that exon sequence to # the transcript sequence. Note that the 1 added to each # exon start takes into account the zero based and half- # open ucsc coordinates. for index in range(int(exon_count)): exon_key = f'{chromosome}:{(int(exon_starts_list[index]) + 1)}-{(exon_ends_list[index])}' exon_sequence = exon_sequence_map[exon_key] tx_sequence += exon_sequence # Remove some spurious characters from the transcript ID tx_id = re.sub(r'::::.*', '', feature_id) # TODO determine if this substitution is still needed. tx_id = re.sub(r'\([^(]+$', '', tx_id) if self.include_suffix_w_tx_id: tx_id = tx_id + "_" + self.genome_suffix # Write the 1st line of the fasta entry - transcript location string transcriptome_fasta_file.write(f'>{tx_id}:{chromosome}:{start}-{end}_{strand}\n') # Write the 2nd line of the fasta entry - gene sequence. transcriptome_fasta_file.write(tx_sequence + '\n')
[docs] def get_commandline_call(self, sample_id, genome_suffix, genome_fasta_file_path, annotation_file_path, include_suffix_w_tx_id=False): """ Prepare command to execute the TranscriptomeFastaPreparationStep from the command line, given all of the arugments used to run the execute() function. Parameters ---------- sample_id : string Identifier for sample corresponding to this reference genome. Used to construct output and log paths for this specific execution. genome_suffix : string Suffix to identify the parent/allele of the source genome. Should be 1 or 2. This same suffix is a appended to all output files, and the individual transcript IDs in the transcriptome FASTA if the include_suffix_w_tx_id parameter is set to TRUE. genome_fasta_file_path : string Input genome fasta filename containing all the chromosomes of interest. No line breaks are allowed within the chromosome sequence. annotation_file_path : string Input information about the transcripts - fields are (chromosome, strand, start, end, exon count, exon starts, exon ends, transcript ID, gene ID, gene symbol) include_suffix_w_tx_id : boolean Append parent/allele suffix to transcript names in FASTA headers of the output file when set to True [Default: False]. Returns ------- string Command to execute on the command line. It will perform the same operations as a call to execute() with the same parameters. """ #Retrieve path to the gene_files_preparation.py script. txptome_fasta_prep_path = os.path.realpath(__file__) #If the above command returns a string with a "pyc" extension, instead #of "py", strip off "c" so it points to this script. txptome_fasta_prep_path = txptome_fasta_prep_path.rstrip('c') command = (f" python {txptome_fasta_prep_path}" f" --log_directory_path {self.log_directory_path}" f" --data_directory_path {self.data_directory_path}" f" --sample_id {sample_id}" f" --genome_suffix {genome_suffix}" f" --genome_fasta_file_path {genome_fasta_file_path}" f" --annotation_file_path {annotation_file_path}") if include_suffix_w_tx_id: command += f" --add_suffix" return command
[docs] def get_validation_attributes(self, sample_id, genome_suffix, genome_fasta_file_path, annotation_file_path, include_suffix_w_tx_id=False): """ Prepare attributes required by is_output_valid() function to validate output generated the TranscriptomeFastaPreparationStep job corresponding to the given input files. Parameters ---------- sample_id : string Identifier for sample corresponding to this reference genome. Used to construct output and log paths for this specific execution. genome_suffix : string Suffix to identify the parent/allele of the source genome. Should be 1 or 2. This same suffix is a appended to all output files, and the individual transcript IDs in the transcriptome FASTA if the include_suffix_w_tx_id parameter is set to TRUE. genome_fasta_file_path : string Input genome fasta filename containing all the chromosomes of interest. No line breaks are allowed within the chromosome sequence. annotation_file_path : string Input information about the transcripts - fields are (chromosome, strand, start, end, exon count, exon starts, exon ends, transcript ID, gene ID, gene symbol) include_suffix_w_tx_id : boolean Append parent/allele suffix to transcript names in FASTA headers of the output file when set to True [Default: False]. Returns ------- dict A TranscriptomeFastaPreparationStep job's data_directory, log_directory, input genome_fasta_file_path, input annotation_file_path, and output transcriptome_fasta_file_path used when creating the transcriptome FASTA files. """ validation_attributes = {} validation_attributes['data_directory'] = self.data_directory_path validation_attributes['log_directory'] = self.log_directory_path validation_attributes['sample_id'] = sample_id validation_attributes['genome_suffix'] = genome_suffix validation_attributes['genome_fasta_file_path'] = genome_fasta_file_path validation_attributes['annotation_file_path'] = annotation_file_path return validation_attributes
[docs] @staticmethod def is_output_valid(validation_attributes): """ Check if output of TranscriptomeFastaPreparationStep for a specific job/ execution is correctly formed and valid, given a job's data directory, log directory, input genome FASTA filename, input annotation file, and output transcriptome FASTA filename. Prepare these attributes for a given jobs using the get_validation_attributes() method. Parameters ---------- validation_attributes : dict A job's data_directory, log_directory, input genome_fasta_file_path, input annotation_file_path, and output transcriptome_fasta_file_path used when creating the transcriptome FASTA files. Returns ------- boolean True - TranscriptomeFastaPreparationStep output files were created and are well formed. False - TranscriptomeFastaPreparationStep output files do not exist or are missing data. """ data_directory = validation_attributes['data_directory'] log_directory_path = validation_attributes['log_directory'] sample_id = validation_attributes['sample_id'] genome_suffix = validation_attributes['genome_suffix'] genome_fasta_file_path = validation_attributes['genome_fasta_file_path'] annotation_file_path = validation_attributes['annotation_file_path'] valid_output = False # Construct output filenames edited_genome_fasta_file_path = os.path.splitext(genome_fasta_file_path)[0] + "_edited.fa" trimmed_annotation_file_path = os.path.splitext(annotation_file_path)[0] + "_trimmed.txt" transcriptome_fasta_file_path = os.path.join(data_directory, f'sample{sample_id}', CAMPAREE_CONSTANTS.TRANSCRIPTOME_FASTA_OUTPUT_FILENAME_PATTERN.format(genome_name=genome_suffix)) log_file_path = os.path.join(log_directory_path, f'sample{sample_id}', CAMPAREE_CONSTANTS.TRANSCRIPTOME_FASTA_LOG_FILENAME_PATTERN.format(genome_name=genome_suffix)) if os.path.isfile(edited_genome_fasta_file_path) and \ os.path.isfile(trimmed_annotation_file_path) and \ os.path.isfile(transcriptome_fasta_file_path) and \ os.path.isfile(log_file_path): #Read last line in log file line = "" with open(log_file_path, "r") as log_file: for line in log_file: line = line.rstrip() if line == "ALL DONE!": valid_output = True return valid_output
[docs] @staticmethod def main(): """Entry point into script when called directly. Parses arguments, gathers input and output filenames, and calls methods that perform the actual operation. """ parser = argparse.ArgumentParser(description='Create transcriptome FASTA' ' from genome sequence and annotation.') parser.add_argument('-l', '--log_directory_path', required=True, help="Path to log directory.") parser.add_argument('-d', '--data_directory_path', required=True, help='Path to data directory') parser.add_argument('--sample_id', required=True, help='Sample ID associated with input genome.') parser.add_argument('--genome_suffix', required=True, help='Suffix identifying parent/allele of source genome.') parser.add_argument('-g', '--genome_fasta_file_path', required=True, help='Input genome FASTA filename, with no linebreaks ' 'within a given chromosome\'s sequence') parser.add_argument('-a', '--annotation_file_path', required=True, help='Annotation file using coordinates matching ' 'the input genome FASTA file.') parser.add_argument('-s', '--add_suffix', action="store_true", help='Append genome suffix to each transcript name ' 'in FASTA headers of the output file.') args = parser.parse_args() tx_fasta_prep = TranscriptomeFastaPreparationStep(args.log_directory_path, args.data_directory_path) tx_fasta_prep.execute(sample_id=args.sample_id, genome_suffix=args.genome_suffix, genome_fasta_file_path=args.genome_fasta_file_path, annotation_file_path=args.annotation_file_path, include_suffix_w_tx_id=args.add_suffix)
if __name__ == "__main__": sys.exit(TranscriptomeFastaPreparationStep.main())