Source code for camparee.transcript_gene_quant

import argparse
import sys
import os
import collections

from camparee.abstract_camparee_step import AbstractCampareeStep
from camparee.camparee_constants import CAMPAREE_CONSTANTS

[docs]class TranscriptGeneQuantificationStep(AbstractCampareeStep):
    """This class takes a kallisto output file and generates transcript- and
    gene-level quantification files, and a file of PSI (Percent Spliced In)
    values for alternative spliceforms.

    """

    OUTPUT_TRANSCRIPT_FILE_NAME = CAMPAREE_CONSTANTS.TXQUANT_OUTPUT_TX_FILENAME
    OUTPUT_GENE_FILE_NAME = CAMPAREE_CONSTANTS.TXQUANT_OUTPUT_GENE_FILENAME
    OUTPUT_PSI_VALUE_FILE_NAME = CAMPAREE_CONSTANTS.TXQUANT_OUTPUT_PSI_FILENAME

    def __init__(self, log_directory_path, data_directory_path, parameter=None):
        """Constructor for TranscriptGeneQuantificationStep object.

        Parameters
        ----------
        data_directory_path: string
            Full path to data directory
        log_directory_path : string
            Full path to log directory.
        parameters : dict
            Dictionary of other parameters specified by the config file. This
            parameter is not used by this class and is retained for uniformity
            with all other CAMPAREE steps.

        """
        self.data_directory_path = data_directory_path
        self.log_directory_path = log_directory_path

[docs]    def validate(self):
        return True

[docs]    def execute(self, sample_id, tx_abundance_file_path, annotation_file_path):
        """Main work-horse function that generates transcript, gene, and PSI
        count files from transcript-level kallisto data.

        Parameters
        ----------
        sample_id : string
            Identifier for sample corresponding to the input kallisto file. Used
            to construct output and log paths for this specific execution.
        tx_abundance_file_path : string
            File of transcript abundances created by kallisto. Likely generated
            by KallistoQuantStep.
        annotation_file_path : string
            Input transcript annotation file. Used to map transcript IDs to gene
            IDs. This is generally prepared by the UpdateAnnotationForGenomeStep.

        """

        # Used in create_transcript_gene_map function, so it needs to be an
        # instance variable.
        self.annotation_file_path = annotation_file_path

        # Dictionaries to keep track of length of transcript, number of uniquely
        # mapped reads to transcript, and final count of reads mapped to transcript.
        # This procedure does not map all gene info keys used. Consequently we
        # need to insure that assignments using new keys are initialized to 0.
        transcript_final_count = collections.defaultdict(float)
        psi_value_map = collections.defaultdict(list)
        # Used in create_transcript_gene_map, so it needs to be an instance variable.
        self.transcript_gene_map = collections.defaultdict(str)

        # Prepare paths to output filenames
        transcript_count_filename = os.path.join(self.data_directory_path, f'sample{sample_id}',
                                                 self.OUTPUT_TRANSCRIPT_FILE_NAME)
        gene_count_filename = os.path.join(self.data_directory_path, f'sample{sample_id}',
                                           self.OUTPUT_GENE_FILE_NAME)
        psi_value_filename = os.path.join(self.data_directory_path, f'sample{sample_id}',
                                               self.OUTPUT_PSI_VALUE_FILE_NAME)
        log_file_path = os.path.join(self.log_directory_path, f'sample{sample_id}',
                                     CAMPAREE_CONSTANTS.TXQUANT_LOG_FILENAME)

        with open(log_file_path, "w") as log_file:

            print(f"Generate transcript, gene, and PSI value files from kallisto "
                  f"output from sample{sample_id}.")
            log_file.write("Generate transcript, gene, and PSI value files "
                           f"from kallisto output from sample{sample_id}.\n")
            log_file.write(f"Parameters:\n"
                           f"    kallisto abundance path: {tx_abundance_file_path}\n"
                           f"    annotation file path: {annotation_file_path}\n")

            print("Extracting transcript:gene mappings from annotation file.")
            log_file.write("Extracting transcript:gene mappings from annotation file.\n")
            # Transcript : parent gene dictionary
            self.create_transcript_gene_map()

            print("Extracting transcript abundances from kallisto file.")
            log_file.write("Extracting transcript abundances from kallisto file.\n")
            with open(tx_abundance_file_path, "r") as tx_abundance_file:
                for line in tx_abundance_file:
                    if line.startswith("target_id"):
                        continue
                    line = line.rstrip('\n').split('\t')
                    transcript_id = line[0].split(':')[0]
                    transcript_fpk = float(line[3]) / float(line[2]) * 1000 # est_counts / eff_length * 1000 = FPK
                    transcript_final_count[transcript_id] = transcript_fpk

            print("Writing transcript-level quants to file.")
            log_file.write("Writing transcript-level quants to file.\n")
            # Write the transcript quantification information to transcript quant filename
            with open(transcript_count_filename, 'w') as transcript_count_file:
                transcript_count_file.write('#transcript_id' + '\t' + 'cnt' + '\n')
                for key, value in transcript_final_count.items():
                    transcript_count_file.write(str(key) + '\t' + str(round(value, 3)) + '\n')

            print("Summing transcript-level quants by gene to get gene-level quants.")
            log_file.write("Summing transcript-level quants by gene to get gene-level quants.\n")
            # Add the transcript counts to parent gene to get gene count
            gene_count = collections.defaultdict(float)
            for key, value in transcript_final_count.items():
                gene_count[self.transcript_gene_map[key]] += value

            gene_count = collections.OrderedDict(sorted(gene_count.items()))

            print("Writing gene-level quants to file.")
            log_file.write("Writing gene-level quants to file.\n")
            # Write gene quantification information to gene quant filename
            with open(gene_count_filename, 'w') as gene_count_file:
                gene_count_file.write('#gene_id' + '\t' + 'cnt' + '\n')
                for key, value in gene_count.items():
                    gene_count_file.write(str(key) + '\t' + str(round(value,3)) + '\n')

            print("Deriving PSI values from gene- and transcript-level quants.")
            log_file.write("Deriving PSI values from gene- and transcript-level quants.\n")
            # Create dictionary with key gene_id and values isoforms and their psi values
            for transcript_id in transcript_final_count.keys():
                gene_id = self.transcript_gene_map[transcript_id]
                if gene_count[gene_id]:
                    psi_value_map[gene_id].append(transcript_id + ':' \
                        + str(transcript_final_count[transcript_id]/gene_count[gene_id]))
                else:
                    psi_value_map[gene_id].append(transcript_id + ':' + str(0))

            print("Writing PSI values to file.")
            log_file.write("Writing PSI values to file.\n")
            # Write psi value information for each gene
            with open(psi_value_filename, 'w') as psi_value_file:
                psi_value_file.write('#gene_id' + '\t' + 'isoform_psi_value' + '\n')
                for gene_id in psi_value_map.keys():
                    psi_value_file.write(str(gene_id) + '\t' + ','.join(psi_value_map[gene_id]) + '\n')

            log_file.write("\nALL DONE!\n")

[docs]    def create_transcript_gene_map(self):
        """Create dictionary to map transcript id to gene id using geneinfo file

        """
        with open(self.annotation_file_path, 'r') as annotation_file:
            next(annotation_file)
            for line in annotation_file:
                fields = line.strip('\n').split('\t')
                self.transcript_gene_map[fields[7]] = fields[8]

[docs]    def get_commandline_call(self, sample_id, tx_abundance_file_path, annotation_file_path):
        """
        Prepare command to execute the TranscriptGeneQuantificationStep from the
        command line, given all of the arugments used to run the execute() function.

        Parameters
        ----------
        sample_id : string
            Identifier for sample corresponding to the input kallisto file. Used
            to construct output and log paths for this specific execution.
        tx_abundance_file_path : string
            File of transcript abundances created by kallisto. Likely generated
            by KallistoQuantStep.
        annotation_file_path : string
            Input transcript annotation file. Used to map transcript IDs to gene
            IDs. This is generally prepared by the UpdateAnnotationForGenomeStep.

        Returns
        -------
        string
            Command to execute on the command line. It will perform the same
            operations as a call to execute() with the same parameters.

        """
        #Retrieve path to the transcript_gene_quant.py script.
        txquant_step_path = os.path.realpath(__file__)
        #If the above command returns a string with a "pyc" extension, instead
        #of "py", strip off "c" so it points to this script.
        txquant_step_path = txquant_step_path.rstrip('c')

        command = (f" python {txquant_step_path}"
                   f" --log_directory_path {self.log_directory_path}"
                   f" --data_directory_path {self.data_directory_path}"
                   f" --sample_id {sample_id}"
                   f" --tx_abundance_file_path {tx_abundance_file_path}"
                   f" --annotation_file_path {annotation_file_path}")

        return command

[docs]    def get_validation_attributes(self, sample_id, tx_abundance_file_path, annotation_file_path):
        """
        Prepare attributes required by is_output_valid() function to validate
        output generated by the TranscriptGeneQuantificationStep job.

        Parameters
        ----------
        sample_id : string
            Identifier for sample corresponding to the input kallisto file. Used
            to construct output and log paths for this specific execution.
        tx_abundance_file_path : string
            File of transcript abundances created by kallisto. Likely generated
            by KallistoQuantStep. [Note: this parameter is captured just so
            get_validation_attributes() accepts the same arguments as
            get_commandline_call(). It is not used here.]
        annotation_file_path : string
            Input transcript annotation file. Used to map transcript IDs to gene
            IDs. This is generally prepared by the UpdateAnnotationForGenomeStep.
            [Note: this parameter is captured just so get_validation_attributes()
            accepts the same arguments as get_commandline_call(). It is not used
            here.]

        Returns
        -------
        dict
            A TranscriptGeneQuantificationStep job's data_directory, log_directory,
            and corresponding sample ID.
        """
        validation_attributes = {}
        validation_attributes['data_directory'] = self.data_directory_path
        validation_attributes['log_directory'] = self.log_directory_path
        validation_attributes['sample_id'] = sample_id
        return validation_attributes

[docs]    @staticmethod
    def is_output_valid(validation_attributes):
        """
        Check if output of TranscriptGeneQuantificationStep for a specific
        job/execution is correctly formed and valid, given a job's data directory,
        log directory, and sample ID. Prepare these attributes for a given job
        using the get_validation_attributes() method.

        Parameters
        ----------
        validation_attributes : dict
            A job's data_directory, log_directory, and corresponding sample_id.

        Returns
        -------
        boolean
            True  - TranscriptGeneQuantificationStep output files were created
                    and are well formed.
            False - TranscriptGeneQuantificationStep output files do not exist
                    or are missing data.

        """

        data_directory_path = validation_attributes['data_directory']
        log_directory_path = validation_attributes['log_directory']
        sample_id = validation_attributes['sample_id']

        valid_output = False

        # Construct output filenames/paths
        transcript_count_filename = os.path.join(data_directory_path, f'sample{sample_id}',
                                                 TranscriptGeneQuantificationStep.OUTPUT_TRANSCRIPT_FILE_NAME)
        gene_count_filename = os.path.join(data_directory_path, f'sample{sample_id}',
                                           TranscriptGeneQuantificationStep.OUTPUT_GENE_FILE_NAME)
        psi_value_filename = os.path.join(data_directory_path, f'sample{sample_id}',
                                          TranscriptGeneQuantificationStep.OUTPUT_PSI_VALUE_FILE_NAME)
        log_file_path = os.path.join(log_directory_path, f'sample{sample_id}',
                                     CAMPAREE_CONSTANTS.TXQUANT_LOG_FILENAME)

        if os.path.isfile(transcript_count_filename) and \
           os.path.isfile(gene_count_filename) and \
           os.path.isfile(psi_value_filename) and \
           os.path.isfile(log_file_path):

            #Read last line in log file
            line = ""
            with open(log_file_path, "r") as log_file:
                for line in log_file:
                    line = line.rstrip()
            if line == "ALL DONE!":
                valid_output = True

        return valid_output

[docs]    @staticmethod
    def main():
        """
        Entry point into script. Parses the argument list to obtain all the files
        needed and feeds them to the class constructor. Calls the appropriate
        methods thereafter.

        """

        parser = argparse.ArgumentParser(description='Parse kallisto results and '
                                                     'generate transcript, gene, '
                                                     'and PSI quant files.')
        parser.add_argument('-l', '--log_directory_path', required=True,
                            help="Path to log directory.")
        parser.add_argument('-d', '--data_directory_path', required=True,
                            help='Path to data directory')
        parser.add_argument('--sample_id', required=True,
                            help='Sample ID associated with input kallisto data.')
        parser.add_argument('-k', '--tx_abundance_file_path', required=True,
                            help='Transcript-level abundance file generated by kallisto')
        parser.add_argument('-a', '--annotation_file_path', required=True,
                            help='Annotation file mapping transcript IDs to gene IDs')

        args = parser.parse_args()

        transcript_gene_quant = TranscriptGeneQuantificationStep(log_directory_path=args.log_directory_path,
                                                                 data_directory_path=args.data_directory_path)
        transcript_gene_quant.execute(sample_id=args.sample_id,
                                      tx_abundance_file_path=args.tx_abundance_file_path,
                                      annotation_file_path=args.annotation_file_path)

if __name__ == "__main__":
    sys.exit(TranscriptGeneQuantificationStep.main())
Source code for camparee.transcript_gene_quant

CAMPAREE

Navigation

Related Topics