Source code for camparee.transcript_gene_quant

import argparse
import sys
import os
import collections

from camparee.abstract_camparee_step import AbstractCampareeStep
from camparee.camparee_constants import CAMPAREE_CONSTANTS

[docs]class TranscriptGeneQuantificationStep(AbstractCampareeStep): """This class takes a kallisto output file and generates transcript- and gene-level quantification files, and a file of PSI (Percent Spliced In) values for alternative spliceforms. """ OUTPUT_TRANSCRIPT_FILE_NAME = CAMPAREE_CONSTANTS.TXQUANT_OUTPUT_TX_FILENAME OUTPUT_GENE_FILE_NAME = CAMPAREE_CONSTANTS.TXQUANT_OUTPUT_GENE_FILENAME OUTPUT_PSI_VALUE_FILE_NAME = CAMPAREE_CONSTANTS.TXQUANT_OUTPUT_PSI_FILENAME def __init__(self, log_directory_path, data_directory_path, parameter=None): """Constructor for TranscriptGeneQuantificationStep object. Parameters ---------- data_directory_path: string Full path to data directory log_directory_path : string Full path to log directory. parameters : dict Dictionary of other parameters specified by the config file. This parameter is not used by this class and is retained for uniformity with all other CAMPAREE steps. """ self.data_directory_path = data_directory_path self.log_directory_path = log_directory_path
[docs] def validate(self): return True
[docs] def execute(self, sample_id, tx_abundance_file_path, annotation_file_path): """Main work-horse function that generates transcript, gene, and PSI count files from transcript-level kallisto data. Parameters ---------- sample_id : string Identifier for sample corresponding to the input kallisto file. Used to construct output and log paths for this specific execution. tx_abundance_file_path : string File of transcript abundances created by kallisto. Likely generated by KallistoQuantStep. annotation_file_path : string Input transcript annotation file. Used to map transcript IDs to gene IDs. This is generally prepared by the UpdateAnnotationForGenomeStep. """ # Used in create_transcript_gene_map function, so it needs to be an # instance variable. self.annotation_file_path = annotation_file_path # Dictionaries to keep track of length of transcript, number of uniquely # mapped reads to transcript, and final count of reads mapped to transcript. # This procedure does not map all gene info keys used. Consequently we # need to insure that assignments using new keys are initialized to 0. transcript_final_count = collections.defaultdict(float) psi_value_map = collections.defaultdict(list) # Used in create_transcript_gene_map, so it needs to be an instance variable. self.transcript_gene_map = collections.defaultdict(str) # Prepare paths to output filenames transcript_count_filename = os.path.join(self.data_directory_path, f'sample{sample_id}', self.OUTPUT_TRANSCRIPT_FILE_NAME) gene_count_filename = os.path.join(self.data_directory_path, f'sample{sample_id}', self.OUTPUT_GENE_FILE_NAME) psi_value_filename = os.path.join(self.data_directory_path, f'sample{sample_id}', self.OUTPUT_PSI_VALUE_FILE_NAME) log_file_path = os.path.join(self.log_directory_path, f'sample{sample_id}', CAMPAREE_CONSTANTS.TXQUANT_LOG_FILENAME) with open(log_file_path, "w") as log_file: print(f"Generate transcript, gene, and PSI value files from kallisto " f"output from sample{sample_id}.") log_file.write("Generate transcript, gene, and PSI value files " f"from kallisto output from sample{sample_id}.\n") log_file.write(f"Parameters:\n" f" kallisto abundance path: {tx_abundance_file_path}\n" f" annotation file path: {annotation_file_path}\n") print("Extracting transcript:gene mappings from annotation file.") log_file.write("Extracting transcript:gene mappings from annotation file.\n") # Transcript : parent gene dictionary self.create_transcript_gene_map() print("Extracting transcript abundances from kallisto file.") log_file.write("Extracting transcript abundances from kallisto file.\n") with open(tx_abundance_file_path, "r") as tx_abundance_file: for line in tx_abundance_file: if line.startswith("target_id"): continue line = line.rstrip('\n').split('\t') transcript_id = line[0].split(':')[0] transcript_fpk = float(line[3]) / float(line[2]) * 1000 # est_counts / eff_length * 1000 = FPK transcript_final_count[transcript_id] = transcript_fpk print("Writing transcript-level quants to file.") log_file.write("Writing transcript-level quants to file.\n") # Write the transcript quantification information to transcript quant filename with open(transcript_count_filename, 'w') as transcript_count_file: transcript_count_file.write('#transcript_id' + '\t' + 'cnt' + '\n') for key, value in transcript_final_count.items(): transcript_count_file.write(str(key) + '\t' + str(round(value, 3)) + '\n') print("Summing transcript-level quants by gene to get gene-level quants.") log_file.write("Summing transcript-level quants by gene to get gene-level quants.\n") # Add the transcript counts to parent gene to get gene count gene_count = collections.defaultdict(float) for key, value in transcript_final_count.items(): gene_count[self.transcript_gene_map[key]] += value gene_count = collections.OrderedDict(sorted(gene_count.items())) print("Writing gene-level quants to file.") log_file.write("Writing gene-level quants to file.\n") # Write gene quantification information to gene quant filename with open(gene_count_filename, 'w') as gene_count_file: gene_count_file.write('#gene_id' + '\t' + 'cnt' + '\n') for key, value in gene_count.items(): gene_count_file.write(str(key) + '\t' + str(round(value,3)) + '\n') print("Deriving PSI values from gene- and transcript-level quants.") log_file.write("Deriving PSI values from gene- and transcript-level quants.\n") # Create dictionary with key gene_id and values isoforms and their psi values for transcript_id in transcript_final_count.keys(): gene_id = self.transcript_gene_map[transcript_id] if gene_count[gene_id]: psi_value_map[gene_id].append(transcript_id + ':' \ + str(transcript_final_count[transcript_id]/gene_count[gene_id])) else: psi_value_map[gene_id].append(transcript_id + ':' + str(0)) print("Writing PSI values to file.") log_file.write("Writing PSI values to file.\n") # Write psi value information for each gene with open(psi_value_filename, 'w') as psi_value_file: psi_value_file.write('#gene_id' + '\t' + 'isoform_psi_value' + '\n') for gene_id in psi_value_map.keys(): psi_value_file.write(str(gene_id) + '\t' + ','.join(psi_value_map[gene_id]) + '\n') log_file.write("\nALL DONE!\n")
[docs] def create_transcript_gene_map(self): """Create dictionary to map transcript id to gene id using geneinfo file """ with open(self.annotation_file_path, 'r') as annotation_file: next(annotation_file) for line in annotation_file: fields = line.strip('\n').split('\t') self.transcript_gene_map[fields[7]] = fields[8]
[docs] def get_commandline_call(self, sample_id, tx_abundance_file_path, annotation_file_path): """ Prepare command to execute the TranscriptGeneQuantificationStep from the command line, given all of the arugments used to run the execute() function. Parameters ---------- sample_id : string Identifier for sample corresponding to the input kallisto file. Used to construct output and log paths for this specific execution. tx_abundance_file_path : string File of transcript abundances created by kallisto. Likely generated by KallistoQuantStep. annotation_file_path : string Input transcript annotation file. Used to map transcript IDs to gene IDs. This is generally prepared by the UpdateAnnotationForGenomeStep. Returns ------- string Command to execute on the command line. It will perform the same operations as a call to execute() with the same parameters. """ #Retrieve path to the transcript_gene_quant.py script. txquant_step_path = os.path.realpath(__file__) #If the above command returns a string with a "pyc" extension, instead #of "py", strip off "c" so it points to this script. txquant_step_path = txquant_step_path.rstrip('c') command = (f" python {txquant_step_path}" f" --log_directory_path {self.log_directory_path}" f" --data_directory_path {self.data_directory_path}" f" --sample_id {sample_id}" f" --tx_abundance_file_path {tx_abundance_file_path}" f" --annotation_file_path {annotation_file_path}") return command
[docs] def get_validation_attributes(self, sample_id, tx_abundance_file_path, annotation_file_path): """ Prepare attributes required by is_output_valid() function to validate output generated by the TranscriptGeneQuantificationStep job. Parameters ---------- sample_id : string Identifier for sample corresponding to the input kallisto file. Used to construct output and log paths for this specific execution. tx_abundance_file_path : string File of transcript abundances created by kallisto. Likely generated by KallistoQuantStep. [Note: this parameter is captured just so get_validation_attributes() accepts the same arguments as get_commandline_call(). It is not used here.] annotation_file_path : string Input transcript annotation file. Used to map transcript IDs to gene IDs. This is generally prepared by the UpdateAnnotationForGenomeStep. [Note: this parameter is captured just so get_validation_attributes() accepts the same arguments as get_commandline_call(). It is not used here.] Returns ------- dict A TranscriptGeneQuantificationStep job's data_directory, log_directory, and corresponding sample ID. """ validation_attributes = {} validation_attributes['data_directory'] = self.data_directory_path validation_attributes['log_directory'] = self.log_directory_path validation_attributes['sample_id'] = sample_id return validation_attributes
[docs] @staticmethod def is_output_valid(validation_attributes): """ Check if output of TranscriptGeneQuantificationStep for a specific job/execution is correctly formed and valid, given a job's data directory, log directory, and sample ID. Prepare these attributes for a given job using the get_validation_attributes() method. Parameters ---------- validation_attributes : dict A job's data_directory, log_directory, and corresponding sample_id. Returns ------- boolean True - TranscriptGeneQuantificationStep output files were created and are well formed. False - TranscriptGeneQuantificationStep output files do not exist or are missing data. """ data_directory_path = validation_attributes['data_directory'] log_directory_path = validation_attributes['log_directory'] sample_id = validation_attributes['sample_id'] valid_output = False # Construct output filenames/paths transcript_count_filename = os.path.join(data_directory_path, f'sample{sample_id}', TranscriptGeneQuantificationStep.OUTPUT_TRANSCRIPT_FILE_NAME) gene_count_filename = os.path.join(data_directory_path, f'sample{sample_id}', TranscriptGeneQuantificationStep.OUTPUT_GENE_FILE_NAME) psi_value_filename = os.path.join(data_directory_path, f'sample{sample_id}', TranscriptGeneQuantificationStep.OUTPUT_PSI_VALUE_FILE_NAME) log_file_path = os.path.join(log_directory_path, f'sample{sample_id}', CAMPAREE_CONSTANTS.TXQUANT_LOG_FILENAME) if os.path.isfile(transcript_count_filename) and \ os.path.isfile(gene_count_filename) and \ os.path.isfile(psi_value_filename) and \ os.path.isfile(log_file_path): #Read last line in log file line = "" with open(log_file_path, "r") as log_file: for line in log_file: line = line.rstrip() if line == "ALL DONE!": valid_output = True return valid_output
[docs] @staticmethod def main(): """ Entry point into script. Parses the argument list to obtain all the files needed and feeds them to the class constructor. Calls the appropriate methods thereafter. """ parser = argparse.ArgumentParser(description='Parse kallisto results and ' 'generate transcript, gene, ' 'and PSI quant files.') parser.add_argument('-l', '--log_directory_path', required=True, help="Path to log directory.") parser.add_argument('-d', '--data_directory_path', required=True, help='Path to data directory') parser.add_argument('--sample_id', required=True, help='Sample ID associated with input kallisto data.') parser.add_argument('-k', '--tx_abundance_file_path', required=True, help='Transcript-level abundance file generated by kallisto') parser.add_argument('-a', '--annotation_file_path', required=True, help='Annotation file mapping transcript IDs to gene IDs') args = parser.parse_args() transcript_gene_quant = TranscriptGeneQuantificationStep(log_directory_path=args.log_directory_path, data_directory_path=args.data_directory_path) transcript_gene_quant.execute(sample_id=args.sample_id, tx_abundance_file_path=args.tx_abundance_file_path, annotation_file_path=args.annotation_file_path)
if __name__ == "__main__": sys.exit(TranscriptGeneQuantificationStep.main())