Source code for camparee.transcriptomes

"""
Module containing a rough draft of transcriptome preparation including:
    1) creation of transcriptome fasta files from genome fasta files and annotations
    2) creation of STAR indexes for transcriptomes
    3) alignment of fastq files to STAR indexes
    4) Quantification at transcript and allele levels
    5) Expression of molecules based on quantifications

Runs these tasks in parallel through bsub
"""

import subprocess
import os
import sys
import argparse
import time
import pickle

import numpy

from beers_utils.constants import MAX_SEED
from beers_utils.sample import Sample
from camparee.gene_files_preparation import GeneFilesPreparation
from camparee.transcript_gene_quant import TranscriptGeneQuantificationStep
from camparee.allelic_imbalance_quant import AllelicImbalanceQuantificationStep
from camparee.molecule_maker import MoleculeMaker
from camparee.abstract_camparee_step import AbstractCampareeStep
from camparee.camparee_constants import CAMPAREE_CONSTANTS


# TODO: Split this class up into each of the separate steps. This way those that
#       are independent from each other can run in parallel.
# TODO: Update all code here and in the various steps to use the file names
#       encoded in CAMPAREE_CONSTANTS. Most stuff in here is currently hard-coded.
[docs]class TranscriptQuantificatAndMoleculeGenerationStep(AbstractCampareeStep): MOLECULES_PER_PACKET = 10_000 def __init__(self, log_directory_path, data_directory_path, parameters=dict()): self.data_directory_path = data_directory_path self.log_directory_path = log_directory_path self.num_bowtie_threads = parameters.get('num_bowtie_threads') if not self.num_bowtie_threads: self.num_bowtie_threads = 1
[docs] def validate(self): return True
[docs] def execute(self, sample, kallisto_file_path, bowtie2_dir_path, output_type, output_molecule_count, seed=None): if seed is not None: numpy.random.seed(seed) sample_dir = os.path.join(self.data_directory_path, f"sample{sample.sample_id}") transcriptome_logfile_path = os.path.join(self.log_directory_path, f"sample{sample.sample_id}", CAMPAREE_CONSTANTS.TRANSCRIPTOMES_LOG_FILENAME) fastq_file_1, fastq_file_2 = sample.fastq_file_paths # Prepare the transcriptome fasta files for i in [1,2]: print(f"Preparing fasta files for transcriptome {i} of sample{sample.sample_id}") genome = os.path.join(sample_dir, f"custom_genome_{i}.fa") geneinfo = os.path.join(sample_dir, f"updated_annotation_{i}.txt") transcriptome = os.path.join(sample_dir, f"transcriptome_{i}.fa") file_prep = GeneFilesPreparation(genome, geneinfo, transcriptome) file_prep.prepare_gene_files() # Build Kallisto indexes for i in [1, 2]: print(f"Building Kallisto indexes for transcriptome {i} of sample{sample.sample_id}") transcriptome = os.path.join(sample_dir, f"transcriptome_{i}.fa") transcriptome_index_dir = os.path.join(sample_dir, f"transcriptome_{i}_index") os.mkdir(transcriptome_index_dir) transcriptome_index = os.path.join(transcriptome_index_dir, f"transcriptome_{i}.kallisto.index") command = f"{kallisto_file_path} index -i {transcriptome_index} {transcriptome}" subprocess.run(command, shell=True, check=True) # Kallisto Quantification Step for i in [1, 2]: print(f"Running Kallisto quantification for transcriptome {i} of sample{sample.sample_id}") transcriptome = os.path.join(sample_dir, f"transcriptome_{i}.fa") transcriptome_index_dir = os.path.join(sample_dir, f"transcriptome_{i}_index") transcriptome_index = os.path.join(transcriptome_index_dir, f"transcriptome_{i}.kallisto.index") kallisto_output_dir = os.path.join(sample_dir, f"transcriptome_{i}_kallisto_quant") os.mkdir(kallisto_output_dir) command = f"{kallisto_file_path} quant -i {transcriptome_index} -o {kallisto_output_dir} {fastq_file_1} {fastq_file_2}" subprocess.run(command, shell=True, check=True) # Run transcript/gene/allelic_imbalance quantification scripts geneinfo_filename = os.path.join(sample_dir, "updated_annotation_1.txt") print(f"Quantifying transcriptome reads for sample{sample.sample_id}") transcript_gene_quant = TranscriptGeneQuantificationStep(geneinfo_filename, sample_dir) transcript_gene_quant.make_transcript_gene_dist_file() # Build Bowtie2 indexes for i in [1,2]: print(f"Building Bowtie2 indexes for transcriptome {i} of sample{sample.sample_id}") transcriptome = os.path.join(sample_dir, f"transcriptome_{i}.fa") bowtie2_build_file_path = os.path.join(bowtie2_dir_path, f"bowtie2-build") transcriptome_index_dir = os.path.join(sample_dir, f"transcriptome_{i}_index") transcriptome_bowtie2_index = os.path.join(transcriptome_index_dir, f"transcriptome_{i}") command = f"{bowtie2_build_file_path} --threads {self.num_bowtie_threads} {transcriptome} {transcriptome_bowtie2_index}" subprocess.run(command, shell=True, check=True) # Align fastq files to Bowtie2 indexes for i in [1, 2]: print(f"Aligning by Bowtie2 indexes for transcriptome {i} of sample{sample.sample_id}") transcriptome = os.path.join(sample_dir, f"transcriptome_{i}.fa") bowtie2_file_path = os.path.join(bowtie2_dir_path, f"bowtie2") transcriptome_index_dir = os.path.join(sample_dir, f"transcriptome_{i}_index") transcriptome_bowtie2_index = os.path.join(transcriptome_index_dir, f"transcriptome_{i}") aligned_file_path = os.path.join(sample_dir, f"{i}_Aligned.out.sam") command = f"{bowtie2_file_path} --threads {self.num_bowtie_threads} --very-sensitive -x {transcriptome_bowtie2_index} -1 {fastq_file_1} -2 {fastq_file_2} -S {aligned_file_path}" subprocess.run(command, shell=True, check=True) print(f"Quantifying allelic imabalance for sample{sample.sample_id}") allelic_imbalance_quant = AllelicImbalanceQuantificationStep(sample_dir, sample) allelic_imbalance_quant.quantify_allelic_imbalance() allelic_imbalance_quant.make_allele_imbalance_dist_file() # Run Molecule maker to generate a packet # TODO: what sample object can we use? print(f"Generating molecules for sample{sample.sample_id}") molecule_maker = MoleculeMaker(sample, sample_dir) if output_type == "packet": # TODO: potentially rounds down the number of molecules to make # TODO: MOLECULES_PER_PACKET maybe should not be a constant num_packets = output_molecule_count // TranscriptQuantificatAndMoleculeGenerationStep.MOLECULES_PER_PACKET for i in range(1,num_packets+1): print(f"Generating packet {i} of {num_packets} for sample{sample.sample_id}") packet = molecule_maker.make_packet(id=f"sample{sample.sample_id}.{i}") with open(os.path.join(sample_dir, f"molecule_packet{i}.pickle"), "wb") as out_file: pickle.dump(packet, out_file) elif output_type == "molecule_file": molecule_maker.make_molecule_file(filepath=os.path.join(sample_dir, f"molecule_file"), N = output_molecule_count) print(f"Done with transcriptome.py for sample{sample.sample_id}") # Output to DONE file with open(transcriptome_logfile_path, "w") as done_file: done_file.write("\nALL DONE!\n")
[docs] def get_commandline_call(self, sample, kallisto_file_path, bowtie2_dir_path, output_type, output_molecule_count, seed=None): #Retrieve path to the transcriptomes.py script. transcriptomes_path = os.path.realpath(__file__) #If the above command returns a string with a "pyc" extension, instead #of "py", strip off "c" so it points to this script. transcriptomes_path = transcriptomes_path.rstrip('c') command = (f" python {transcriptomes_path}" f" --log_directory_path {self.log_directory_path}" f" --data_directory_path {self.data_directory_path}" f" --sample '{repr(sample)}'" f" --kallisto_file_path {kallisto_file_path}" f" --bowtie2_dir_path {bowtie2_dir_path}" f" --output_type {output_type}" f" --output_molecule_count {output_molecule_count}" f" --num_bowtie_threads {self.num_bowtie_threads}") if seed is not None: command += f" --seed {seed}" return command
[docs] def get_validation_attributes(self, sample, kallisto_file_path, bowtie2_dir_path, output_type, output_molecule_count, seed=None): validation_attributes = {} validation_attributes['data_directory'] = self.data_directory_path validation_attributes['log_directory'] = self.log_directory_path validation_attributes['sample_id'] = sample.sample_id validation_attributes['output_molecule_count'] = output_molecule_count return validation_attributes
[docs] @staticmethod def main(): parser = argparse.ArgumentParser() parser.add_argument("--log_directory_path", help="Directory for log files") parser.add_argument("--data_directory_path", help="Directory for the data (input + output)") parser.add_argument("--sample", help="Serialized Sample object containing FASTQ file info.") parser.add_argument("--kallisto_file_path", help="path to Kallisto executable") parser.add_argument("--bowtie2_dir_path", help="path to Bowtie2 directory") parser.add_argument("--output_type", help="type of output file to write", choices=["packet", "molecule_file"], default="molecule_file") parser.add_argument("--output_molecule_count", help="number of molecules to output", type=int) parser.add_argument("--num_bowtie_threads", type=int, default=None, help="Number of threads to use for Bowtie alignment") parser.add_argument("--seed", help="seed for RNG", default=None, type=int) args = parser.parse_args() config_parameters = {} config_parameters['num_bowtie_threads'] = args.num_bowtie_threads transcriptomes = TranscriptQuantificatAndMoleculeGenerationStep(args.log_directory_path, args.data_directory_path, config_parameters) sample = eval(args.sample) transcriptomes.execute(sample, args.kallisto_file_path, args.bowtie2_dir_path, args.output_type, args.output_molecule_count, args.seed)
[docs] @staticmethod def is_output_valid(validation_attributes): data_directory = validation_attributes['data_directory'] log_directory = validation_attributes['log_directory'] sample_id = validation_attributes['sample_id'] output_molecule_count = validation_attributes['output_molecule_count'] valid_output = False transcriptome_logfile_path = os.path.join(log_directory, f"sample{sample_id}", CAMPAREE_CONSTANTS.TRANSCRIPTOMES_LOG_FILENAME) molecule_file = os.path.join(data_directory, f"sample{sample_id}", "molecule_file") if os.path.isfile(transcriptome_logfile_path) and \ os.path.isfile(molecule_file): #Read last line in transcriptomes log file line = "" with open(transcriptome_logfile_path, "r") as transcriptome_log_file: for line in transcriptome_log_file: line = line.rstrip() if line == "ALL DONE!": valid_output = True return valid_output
if __name__ == '__main__': sys.exit(TranscriptQuantificatAndMoleculeGenerationStep.main())