Source code for camparee.update_annotation_for_genome

import argparse
import sys
import os
import collections
import bisect
from timeit import default_timer as timer

from beers_utils.sample import Sample
from camparee.camparee_utils import CampareeUtils
from camparee.abstract_camparee_step import AbstractCampareeStep
from camparee.camparee_constants import CAMPAREE_CONSTANTS

[docs]class UpdateAnnotationForGenomeStep(AbstractCampareeStep):
    """Updates a gene annotation's coordinates to account for insertions &
    deletions (indels) introduced by GenomeFilesPreparation when it creates
    variant genomes. Note, this is designed to update annotation for a single
    variant genome at a time.

    Parameters
    ----------
    genome_indel_filename : string
        Path to file containing list of indel locations generated by the
        GenomeFilesPreparation. This file has no header and contains three,
        tab-delimited colums (example):
            1:134937	D	2
            1:138813	I	1
        First column = Chromosome and coordinate of indel in the original
                       reference genome. Note, coordinate is zero-based.
        Second column = "D" if variant is a deletion, "I" if variant is an
                        insertion.
        Third column = Length in bases of indel.
    input_annot_filename : string
        Path to file containing gene/transcript annotations using coordinates
        to the original reference genome. This file should have 11, tab-delimited
        columns and includes a header (example):
            chrom  strand  txStart  txEnd exonCount  exonStarts         exonEnds           transcriptID     geneID           geneSymbol  biotype
            1      +       11869    14409 3          11869,12613,13221  12227,12721,14409  ENST00000456328  ENSG00000223972  DDX11L1     pseudogene
        An annotation file with this format can be generated from a GTF file
        using the convert_gtf_to_annot_file_format() function in the Utils
        package. A template for this annotation format is available in the
        class variable Utils.annot_output_format.
    updated_annot_filename : string
        Path to the output file containing gene/transcript annotations with
        coordinates updated to match the variant genome.
    log_filename : string
        Path to the log file.
    """

    #Name of updated annotation file generated by this script.
    UPDATE_ANNOT_OUTPUT_FILENAME_PATTERN = CAMPAREE_CONSTANTS.UPDATEANNOT_OUTPUT_FILENAME_PATTERN
    #Name of file where script logging is stored
    UPDATE_ANNOT_LOG_FILENAME_PATTERN = CAMPAREE_CONSTANTS.UPDATEANNOT_LOG_FILENAME_PATTERN

    def __init__(self, log_directory_path, data_directory_path, parameters=dict()):
        """Short summary.

        Parameters
        ----------
        data_directory_path: string
            Full path to data directory
        log_directory_path : string
            Full path to log directory.
        """
        self.data_directory_path = data_directory_path
        self.log_directory_path = log_directory_path

[docs]    def validate(self):
        return True

[docs]    def execute(self, sample, genome_indel_suffix, input_annot_file_path, chr_ploidy_file_path):
        """Main work-horse function that generates the updated annotation.

         Parameters
        ----------
        genome_indel_suffix : int
             Suffix to apply to obtain proper genome indel file. Should be 1 or 2.
        input_annot_filename : string
            Full path to annotation file with coordinates for reference genome.
        """

        # Compute which chromosomes we need to have in this annotation
        # If we are in annotation_2, we only want ones with ploidy 2 in this gender
        gender_index = 1 if sample.gender == "female" else 0
        self.chr_ploidy = self._get_chr_ploidy_from_file(chr_ploidy_file_path)
        desired_chromosomes = [chromosome for chromosome, ploidies in self.chr_ploidy.items()
                               if  ploidies[gender_index] >= int(genome_indel_suffix)]

        #TODO: Switch all of these filenames/patterns so they are stored/read
        #      from the CAMPAREE CONSTANTS.
        self.genome_indel_file_path = os.path.join(self.data_directory_path, f"sample{sample.sample_id}",
                                                   f"custom_genome_indels_{genome_indel_suffix}.txt")
        self.input_annot_file_path = input_annot_file_path
        self.updated_annot_file_path = os.path.join(self.data_directory_path, f"sample{sample.sample_id}",
                                                    self.UPDATE_ANNOT_OUTPUT_FILENAME_PATTERN.format(genome_name=genome_indel_suffix))
        self.log_file_path = os.path.join(self.log_directory_path, f'sample{sample.sample_id}',
                                          self.UPDATE_ANNOT_LOG_FILENAME_PATTERN.format(genome_name=genome_indel_suffix))


        #Load indel offsets from the indel file
        indel_offsets = UpdateAnnotationForGenomeStep._get_offsets_from_variant_file(self.genome_indel_file_path)

        with open(self.input_annot_file_path, 'r') as input_annot_file, \
                open(self.updated_annot_file_path, 'w') as updated_annot_file, \
                open(self.log_file_path, 'w') as log_file:

            #Print header for annotation file
            updated_annot_file.write("#" + CampareeUtils.annot_output_format.replace('{', '').replace('}', ''))

            current_chrom = ""

            for annot_feature in input_annot_file:

                annot_feature = annot_feature.rstrip('\n')
                line_data = annot_feature.split('\t')

                if current_chrom != line_data[0]:

                    #Skip header lines (nest in here so it's only checked when
                    #chromosomes change).
                    if annot_feature[0] == '#':
                        continue

                    current_chrom = line_data[0]
                    log_file.write(f"Processing indels and annotated features from chromosome {current_chrom}.\n")

                    if current_chrom in indel_offsets:
                        """
                        Since code below will be performing many lookups and index-
                        based references to the values and keys in current_chrom_variants,
                        it will likely be more efficient to create a list of values
                        and a list of keys from current_chrom_variants once, rather
                        than re-creating them each time the code needs to access a
                        key or value by ordered index.
                        """
                        current_chrom_variant_coords = list(indel_offsets[current_chrom].keys())
                        current_chrom_variant_offsets = list(indel_offsets[current_chrom].values())
                    else:
                        #New chromosome contains no variants
                        log_file.write(f"----No indels from chromosome {current_chrom}.\n")
                        current_chrom_variant_coords = ()
                        current_chrom_variant_coords = ()

                if current_chrom not in desired_chromosomes:
                    continue

                #Current chromosome contains variants
                if current_chrom_variant_coords:

                    tx_start = int(line_data[2])
                    tx_end = int(line_data[3])
                    #exon_count = int(line_data[4])
                    exon_starts = [int(coord) for coord in line_data[5].split(',')]
                    exon_ends = [int(coord) for coord in line_data[6].split(',')]

                    #bisect_right() finds the index at which to insert the given
                    #coordinate in sorted order. Since I'm looking for the
                    #closest coordinate <= the given coordinate, subtract 1 from
                    #the result of bisect_right() to get the correct index.
                    tx_start_offset_index = bisect.bisect_right(current_chrom_variant_coords, tx_start) - 1
                    tx_end_offset_index = bisect.bisect_right(current_chrom_variant_coords, tx_end) - 1

                    #No indels before start of current feature.
                    if tx_start_offset_index == -1:

                        updated_tx_start = tx_start

                        #No indels before end of current feature
                        if tx_end_offset_index == -1:
                            updated_tx_end = tx_end
                            updated_exon_starts = exon_starts
                            updated_exon_ends = exon_ends
                        #First indels occur before end of current feature
                        else:
                            updated_tx_end = tx_end + current_chrom_variant_offsets[tx_end_offset_index]

                            updated_exon_starts = []
                            updated_exon_ends = []
                            for coord in exon_starts:
                                ex_coord_offset_index = bisect.bisect_right(current_chrom_variant_coords, coord) - 1
                                updated_exon_coord = coord
                                if ex_coord_offset_index >= 0:
                                    updated_exon_coord += current_chrom_variant_offsets[ex_coord_offset_index]
                                updated_exon_starts.append(updated_exon_coord)
                            for coord in exon_ends:
                                ex_coord_offset_index = bisect.bisect_right(current_chrom_variant_coords, coord) - 1
                                updated_exon_coord = coord
                                if ex_coord_offset_index >= 0:
                                    updated_exon_coord += current_chrom_variant_offsets[ex_coord_offset_index]
                                updated_exon_ends.append(updated_exon_coord)
                    #No new variants between the start and stop coordinates, so
                    #apply the same offset to all coordinates in the current
                    #feature.
                    elif tx_start_offset_index == tx_end_offset_index:
                        offset = current_chrom_variant_offsets[tx_start_offset_index]
                        updated_tx_start = tx_start + offset
                        updated_tx_end = tx_end + offset
                        updated_exon_starts = [coord+offset for coord in exon_starts]
                        updated_exon_ends = [coord+offset for coord in exon_ends]
                    else:
                        updated_tx_start = tx_start + current_chrom_variant_offsets[tx_start_offset_index]
                        updated_tx_end = tx_end + current_chrom_variant_offsets[tx_end_offset_index]

                        #Update lists of exon starts/ends with correct offsets
                        updated_exon_starts = []
                        updated_exon_ends = []
                        for coord in exon_starts:
                            ex_coord_offset_index = bisect.bisect_right(current_chrom_variant_coords, coord) - 1
                            updated_exon_coord = coord + current_chrom_variant_offsets[ex_coord_offset_index]
                            updated_exon_starts.append(updated_exon_coord)
                        for coord in exon_ends:
                            ex_coord_offset_index = bisect.bisect_right(current_chrom_variant_coords, coord) - 1
                            updated_exon_coord = coord + current_chrom_variant_offsets[ex_coord_offset_index]
                            updated_exon_ends.append(updated_exon_coord)

                    #Format updated annotation data and output
                    updated_annot_file.write(
                        CampareeUtils.annot_output_format.format(
                            chrom=line_data[0],
                            strand=line_data[1],
                            txStart=updated_tx_start,
                            txEnd=updated_tx_end,
                            exonCount=line_data[4],
                            exonStarts=','.join([str(x) for x in updated_exon_starts]),
                            exonEnds=','.join([str(x) for x in updated_exon_ends]),
                            transcriptID=line_data[7],
                            geneID=line_data[8],
                            geneSymbol=line_data[9],
                            biotype=line_data[10]
                        )
                    )

                #No variants in the current chromosome, so no need to update
                #feature coordinates.
                else:
                    updated_annot_file.write(f"{annot_feature}\n")

            #Status message used by is_output_valid() method to determine if
            #this script ran to completion.
            log_file.write("\nALL DONE!")

[docs]    def get_commandline_call(self, sample, genome_indel_suffix, input_annot_file_path, chr_ploidy_file_path):
        """
        Prepare command to execute the UpdateAnnotationForGenomeStep from the
        command line, given all of the arugments used to run the execute()
        function.

        Parameters
        ----------
        sample : Sample
            Sample for which to update annotation to parental genomes
        genome_indel_suffix : string
            Suffix to apply to obtain proper genome indel file. This suffix is
            also used in the name of the updated annotation file.
        input_annot_file_path : string
            Full path to annotation file with coordinates for reference genome.
        chr_ploidy_file_path : string
            File that maps chromosome names to their male/female ploidy.

        Returns
        -------
        string
            Command to execute on the command line. It will perform the same
            operations as a call to execute() with the same parameters.

        """
        #Retrieve path to the update_annotation_for_genome.py script.
        update_annotation_path = os.path.realpath(__file__)
        #If the above command returns a string with a "pyc" extension, instead
        #of "py", strip off "c" so it points to this script.
        update_annotation_path = update_annotation_path.rstrip('c')

        command = (f" python {update_annotation_path}"
                   f" --log_directory_path {self.log_directory_path}"
                   f" --data_directory_path {self.data_directory_path}"
                   f" --sample '{repr(sample)}'"
                   f" --genome_indel_suffix {genome_indel_suffix}"
                   f" --input_annot_file_path {input_annot_file_path}"
                   f" --chr_ploidy_file_path {chr_ploidy_file_path}")

        return command

[docs]    def get_validation_attributes(self, sample, genome_indel_suffix, input_annot_file_path, chr_ploidy_file_path):
        """
        Prepare attributes required by is_output_valid() function to validate
        output generated the UpdateAnnotationForGenomeStep job corresponding to
        the given sample.

        Parameters
        ----------
        sample : Sample
            Sample for which to update annotation to parental genomes
        genome_indel_suffix : string
            Suffix to apply to obtain proper genome indel file. This suffix is
            also used in the name of the updated annotation file.
        input_annot_file_path : string
            Full path to annotation file with coordinates for reference genome.
        chr_ploidy_file_path : string
            File that maps chromosome names to their male/female ploidy.

        Returns
        -------
        dict
            A UpdateAnnotationForGenomeStep job's data_directory, log_directory,
            sample_id, and the suffix used when building the updated genome
            sequence.

        """
        validation_attributes = {}
        validation_attributes['data_directory'] = self.data_directory_path
        validation_attributes['log_directory'] = self.log_directory_path
        validation_attributes['sample_id'] = sample.sample_id
        validation_attributes['genome_name'] = genome_indel_suffix
        return validation_attributes

    @staticmethod
    def _get_chr_ploidy_from_file(chr_ploidy_filename):
        #TODO: Check to see if this is redundant with CampareeUtils.create_chr_ploidy_data().
        chr_ploidy = dict()
        with open(chr_ploidy_filename) as chr_ploidy_file:
            chr_ploidy_file.readline() # Header line
            for line in chr_ploidy_file:
                chrom, male, female = line.strip().split("\t")
                chr_ploidy[chrom] = (int(male), int(female))
        return chr_ploidy

    @staticmethod
    def _get_offsets_from_variant_file(genome_indel_filename):
        """Read indel file, calculate rolling offset at each variant position
        and return results as a dictionary of ordered dictionaries, indexed by
        chromoeomse name.

        This method requires the genome_indel_filename attribute is set and
        contains a valid filename.

        Parameters
        ----------
        genome_indel_filename : string
            Full path to indel file generated by GenomeFilesPreparation. This
            file must be sorted by chromosome (any sorting order) and by indel
            coordinate (numerical order).

        Returns
        -------
        OrderedDict nested in defaultdict
            Ordered collection of rolling offsets for each chromosome.
            For outer defaultdict:
                Key = chromosome/contig name from indel file
                Value = OrderedDict (see below)
            For inner OrderedDict:
                Key = chromosomal coordinate of variant position
                Value = rolling offest at variant position
            So variant_offsets["chr1"][12345] stores the rolling offset at
            position 12345 on chromosome 1.

        """

        with open(genome_indel_filename, 'r') as genome_indel_file:

            """
            By using the defaultdict object, I can specify the default value
            used every time I create a new key. This way, I don't need to
            include code to check and instantiate keys with empty OrderedDict
            objects. It's all handed by the defaultdict
            """
            variant_offsets = collections.defaultdict(collections.OrderedDict)
            rolling_offset = 0
            curr_chrom = ""

            for line in genome_indel_file:
                line_data = line.split('\t')
                indel_chrom, indel_position = line_data[0].split(':')
                indel_position = int(indel_position)
                indel_type = line_data[1]
                indel_offset = int(line_data[2])

                #Reset rolling offset for new chromosome
                if curr_chrom != indel_chrom:
                    rolling_offset = 0
                    curr_chrom = indel_chrom

                #If variant is a deletion, make offset negative so it subtracts
                #from the rolling offset.
                if indel_type == 'D':
                    indel_offset *= -1

                rolling_offset += indel_offset
                variant_offsets[indel_chrom][indel_position] = rolling_offset

            return variant_offsets

[docs]    @staticmethod
    def is_output_valid(validation_attributes):
        """
        Check if output of UpdateAnnotationForGenomeStep for a specific job/
        execution is correctly formed and valid, given a job's data directory,
        log directory, and sample id. Prepare these attributes for a given
        sample's jobs using the get_validation_attributes() method.

        Parameters
        ----------
        validation_attributes : dict
            A job's data_directory, log_directory, sample_id, and the suffix used
            when building the updated genome sequence.

        Returns
        -------
        boolean
            True  - UpdateAnnotationForGenomeStep output files were created and
                    are well formed.
            False - UpdateAnnotationForGenomeStep output files do not exist or
                    are missing data.

        """
        data_directory = validation_attributes['data_directory']
        log_directory = validation_attributes['log_directory']
        sample_id = validation_attributes['sample_id']
        genome_name = validation_attributes['genome_name']

        valid_output = False

        update_annot_outfile_path = os.path.join(data_directory, f"sample{sample_id}",
                                                 UpdateAnnotationForGenomeStep.UPDATE_ANNOT_OUTPUT_FILENAME_PATTERN.format(genome_name=genome_name))
        update_annot_logfile_path = os.path.join(log_directory, f"sample{sample_id}",
                                                 UpdateAnnotationForGenomeStep.UPDATE_ANNOT_LOG_FILENAME_PATTERN.format(genome_name=genome_name))

        if os.path.isfile(update_annot_outfile_path) and \
           os.path.isfile(update_annot_logfile_path):
            #Read last line in update_annotation_for_genome log file
            line = ""
            with open(update_annot_logfile_path, "r") as update_annot_log_file:
                for line in update_annot_log_file:
                    line = line.rstrip()
            if line == "ALL DONE!":
                valid_output = True

        return valid_output

[docs]    @staticmethod
    def main():
        """Entry point into script when called directly.

        Parses arguments, gathers input and output filenames, and calls scripts
        that perform the actual operation.

        """
        parser = argparse.ArgumentParser(description='Update annotation file with'
                                                     ' coordinates for variant genome')
        parser.add_argument('-l', '--log_directory_path', required=True,
                            help="Path to log directory.")
        parser.add_argument('-d', '--data_directory_path', required=True,
                            help='Path to data directory')
        parser.add_argument('-g', '--genome_indel_suffix', required=True, type=int, choices=[1,2],
                            help="Integer suffix that distinguishes genome names")
        parser.add_argument('-i', '--input_annot_file_path', required=True,
                            help="Annotation file using reference coordinates")
        parser.add_argument('-p', '--chr_ploidy_file_path')
        parser.add_argument('--sample', default=None,
                            help='String representation of a Sample object. Must provide '
                                 'this argument or the "--sample_id".')
        parser.add_argument('-s', '--sample_id', type=int, default=None,
                            help='sample name in vcf when prepended with sample. Overrides'
                                 ' id from the "--sample" argument, if it is provided.')

        args = parser.parse_args()

        update_annotation = UpdateAnnotationForGenomeStep(args.log_directory_path,
                                                          args.data_directory_path)
        if args.sample:
            sample = eval(args.sample)
        else:
            #Create dummy sample for debug purposes.
            sample = Sample(None, "debug sample", None, None, None)

        #Update sample with sample_id, if specified.
        if args.sample_id:
            sample.sample_id = args.sample_id

        update_annotation.execute(sample=sample,
                                  genome_indel_suffix=args.genome_indel_suffix,
                                  input_annot_file_path=args.input_annot_file_path,
                                  chr_ploidy_file_path=args.chr_ploidy_file_path)

if __name__ == '__main__':
    sys.exit(UpdateAnnotationForGenomeStep.main())

'''
python update_annotation_for_genome.py \
-i '../../resources/index_files/GRCh38/Homo_sapiens.GRCh38.92.annotation.sorted_to_match_fasta.txt' \
-s 1 \
-g 1 \
-d ../../data/pipeline_results_run99/expression_pipeline/data \
-l ../../data/pipeline_results_run99/expression_pipeline/logs
'''
Source code for camparee.update_annotation_for_genome

CAMPAREE

Navigation

Related Topics