"""
Module containing a rough draft of transcriptome preparation including:
1) creation of transcriptome fasta files from genome fasta files and annotations
2) creation of STAR indexes for transcriptomes
3) alignment of fastq files to STAR indexes
4) Quantification at transcript and allele levels
5) Expression of molecules based on quantifications
Runs these tasks in parallel through bsub
"""
import subprocess
import os
import sys
import argparse
import time
import pickle
import numpy
from beers_utils.constants import MAX_SEED
from beers_utils.sample import Sample
from camparee.gene_files_preparation import GeneFilesPreparation
from camparee.transcript_gene_quant import TranscriptGeneQuantificationStep
from camparee.allelic_imbalance_quant import AllelicImbalanceQuantificationStep
from camparee.molecule_maker import MoleculeMaker
from camparee.abstract_camparee_step import AbstractCampareeStep
from camparee.camparee_constants import CAMPAREE_CONSTANTS
# TODO: Split this class up into each of the separate steps. This way those that
# are independent from each other can run in parallel.
# TODO: Update all code here and in the various steps to use the file names
# encoded in CAMPAREE_CONSTANTS. Most stuff in here is currently hard-coded.
[docs]class TranscriptQuantificatAndMoleculeGenerationStep(AbstractCampareeStep):
MOLECULES_PER_PACKET = 10_000
def __init__(self, log_directory_path, data_directory_path, parameters=dict()):
self.data_directory_path = data_directory_path
self.log_directory_path = log_directory_path
self.num_bowtie_threads = parameters.get('num_bowtie_threads')
if not self.num_bowtie_threads:
self.num_bowtie_threads = 1
[docs] def validate(self):
return True
[docs] def execute(self, sample, kallisto_file_path, bowtie2_dir_path, output_type, output_molecule_count, seed=None):
if seed is not None:
numpy.random.seed(seed)
sample_dir = os.path.join(self.data_directory_path, f"sample{sample.sample_id}")
transcriptome_logfile_path = os.path.join(self.log_directory_path, f"sample{sample.sample_id}",
CAMPAREE_CONSTANTS.TRANSCRIPTOMES_LOG_FILENAME)
fastq_file_1, fastq_file_2 = sample.fastq_file_paths
# Prepare the transcriptome fasta files
for i in [1,2]:
print(f"Preparing fasta files for transcriptome {i} of sample{sample.sample_id}")
genome = os.path.join(sample_dir, f"custom_genome_{i}.fa")
geneinfo = os.path.join(sample_dir, f"updated_annotation_{i}.txt")
transcriptome = os.path.join(sample_dir, f"transcriptome_{i}.fa")
file_prep = GeneFilesPreparation(genome, geneinfo, transcriptome)
file_prep.prepare_gene_files()
# Build Kallisto indexes
for i in [1, 2]:
print(f"Building Kallisto indexes for transcriptome {i} of sample{sample.sample_id}")
transcriptome = os.path.join(sample_dir, f"transcriptome_{i}.fa")
transcriptome_index_dir = os.path.join(sample_dir, f"transcriptome_{i}_index")
os.mkdir(transcriptome_index_dir)
transcriptome_index = os.path.join(transcriptome_index_dir, f"transcriptome_{i}.kallisto.index")
command = f"{kallisto_file_path} index -i {transcriptome_index} {transcriptome}"
subprocess.run(command, shell=True, check=True)
# Kallisto Quantification Step
for i in [1, 2]:
print(f"Running Kallisto quantification for transcriptome {i} of sample{sample.sample_id}")
transcriptome = os.path.join(sample_dir, f"transcriptome_{i}.fa")
transcriptome_index_dir = os.path.join(sample_dir, f"transcriptome_{i}_index")
transcriptome_index = os.path.join(transcriptome_index_dir, f"transcriptome_{i}.kallisto.index")
kallisto_output_dir = os.path.join(sample_dir, f"transcriptome_{i}_kallisto_quant")
os.mkdir(kallisto_output_dir)
command = f"{kallisto_file_path} quant -i {transcriptome_index} -o {kallisto_output_dir} {fastq_file_1} {fastq_file_2}"
subprocess.run(command, shell=True, check=True)
# Run transcript/gene/allelic_imbalance quantification scripts
geneinfo_filename = os.path.join(sample_dir, "updated_annotation_1.txt")
print(f"Quantifying transcriptome reads for sample{sample.sample_id}")
transcript_gene_quant = TranscriptGeneQuantificationStep(geneinfo_filename, sample_dir)
transcript_gene_quant.make_transcript_gene_dist_file()
# Build Bowtie2 indexes
for i in [1,2]:
print(f"Building Bowtie2 indexes for transcriptome {i} of sample{sample.sample_id}")
transcriptome = os.path.join(sample_dir, f"transcriptome_{i}.fa")
bowtie2_build_file_path = os.path.join(bowtie2_dir_path, f"bowtie2-build")
transcriptome_index_dir = os.path.join(sample_dir, f"transcriptome_{i}_index")
transcriptome_bowtie2_index = os.path.join(transcriptome_index_dir, f"transcriptome_{i}")
command = f"{bowtie2_build_file_path} --threads {self.num_bowtie_threads} {transcriptome} {transcriptome_bowtie2_index}"
subprocess.run(command, shell=True, check=True)
# Align fastq files to Bowtie2 indexes
for i in [1, 2]:
print(f"Aligning by Bowtie2 indexes for transcriptome {i} of sample{sample.sample_id}")
transcriptome = os.path.join(sample_dir, f"transcriptome_{i}.fa")
bowtie2_file_path = os.path.join(bowtie2_dir_path, f"bowtie2")
transcriptome_index_dir = os.path.join(sample_dir, f"transcriptome_{i}_index")
transcriptome_bowtie2_index = os.path.join(transcriptome_index_dir, f"transcriptome_{i}")
aligned_file_path = os.path.join(sample_dir, f"{i}_Aligned.out.sam")
command = f"{bowtie2_file_path} --threads {self.num_bowtie_threads} --very-sensitive -x {transcriptome_bowtie2_index} -1 {fastq_file_1} -2 {fastq_file_2} -S {aligned_file_path}"
subprocess.run(command, shell=True, check=True)
print(f"Quantifying allelic imabalance for sample{sample.sample_id}")
allelic_imbalance_quant = AllelicImbalanceQuantificationStep(sample_dir, sample)
allelic_imbalance_quant.quantify_allelic_imbalance()
allelic_imbalance_quant.make_allele_imbalance_dist_file()
# Run Molecule maker to generate a packet
# TODO: what sample object can we use?
print(f"Generating molecules for sample{sample.sample_id}")
molecule_maker = MoleculeMaker(sample, sample_dir)
if output_type == "packet":
# TODO: potentially rounds down the number of molecules to make
# TODO: MOLECULES_PER_PACKET maybe should not be a constant
num_packets = output_molecule_count // TranscriptQuantificatAndMoleculeGenerationStep.MOLECULES_PER_PACKET
for i in range(1,num_packets+1):
print(f"Generating packet {i} of {num_packets} for sample{sample.sample_id}")
packet = molecule_maker.make_packet(id=f"sample{sample.sample_id}.{i}")
with open(os.path.join(sample_dir, f"molecule_packet{i}.pickle"), "wb") as out_file:
pickle.dump(packet, out_file)
elif output_type == "molecule_file":
molecule_maker.make_molecule_file(filepath=os.path.join(sample_dir, f"molecule_file"),
N = output_molecule_count)
print(f"Done with transcriptome.py for sample{sample.sample_id}")
# Output to DONE file
with open(transcriptome_logfile_path, "w") as done_file:
done_file.write("\nALL DONE!\n")
[docs] def get_commandline_call(self, sample, kallisto_file_path, bowtie2_dir_path, output_type, output_molecule_count, seed=None):
#Retrieve path to the transcriptomes.py script.
transcriptomes_path = os.path.realpath(__file__)
#If the above command returns a string with a "pyc" extension, instead
#of "py", strip off "c" so it points to this script.
transcriptomes_path = transcriptomes_path.rstrip('c')
command = (f" python {transcriptomes_path}"
f" --log_directory_path {self.log_directory_path}"
f" --data_directory_path {self.data_directory_path}"
f" --sample '{repr(sample)}'"
f" --kallisto_file_path {kallisto_file_path}"
f" --bowtie2_dir_path {bowtie2_dir_path}"
f" --output_type {output_type}"
f" --output_molecule_count {output_molecule_count}"
f" --num_bowtie_threads {self.num_bowtie_threads}")
if seed is not None:
command += f" --seed {seed}"
return command
[docs] def get_validation_attributes(self, sample, kallisto_file_path, bowtie2_dir_path, output_type, output_molecule_count, seed=None):
validation_attributes = {}
validation_attributes['data_directory'] = self.data_directory_path
validation_attributes['log_directory'] = self.log_directory_path
validation_attributes['sample_id'] = sample.sample_id
validation_attributes['output_molecule_count'] = output_molecule_count
return validation_attributes
[docs] @staticmethod
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--log_directory_path", help="Directory for log files")
parser.add_argument("--data_directory_path", help="Directory for the data (input + output)")
parser.add_argument("--sample", help="Serialized Sample object containing FASTQ file info.")
parser.add_argument("--kallisto_file_path", help="path to Kallisto executable")
parser.add_argument("--bowtie2_dir_path", help="path to Bowtie2 directory")
parser.add_argument("--output_type", help="type of output file to write", choices=["packet", "molecule_file"], default="molecule_file")
parser.add_argument("--output_molecule_count", help="number of molecules to output", type=int)
parser.add_argument("--num_bowtie_threads", type=int, default=None,
help="Number of threads to use for Bowtie alignment")
parser.add_argument("--seed", help="seed for RNG", default=None, type=int)
args = parser.parse_args()
config_parameters = {}
config_parameters['num_bowtie_threads'] = args.num_bowtie_threads
transcriptomes = TranscriptQuantificatAndMoleculeGenerationStep(args.log_directory_path,
args.data_directory_path,
config_parameters)
sample = eval(args.sample)
transcriptomes.execute(sample,
args.kallisto_file_path,
args.bowtie2_dir_path,
args.output_type,
args.output_molecule_count,
args.seed)
[docs] @staticmethod
def is_output_valid(validation_attributes):
data_directory = validation_attributes['data_directory']
log_directory = validation_attributes['log_directory']
sample_id = validation_attributes['sample_id']
output_molecule_count = validation_attributes['output_molecule_count']
valid_output = False
transcriptome_logfile_path = os.path.join(log_directory, f"sample{sample_id}",
CAMPAREE_CONSTANTS.TRANSCRIPTOMES_LOG_FILENAME)
molecule_file = os.path.join(data_directory, f"sample{sample_id}", "molecule_file")
if os.path.isfile(transcriptome_logfile_path) and \
os.path.isfile(molecule_file):
#Read last line in transcriptomes log file
line = ""
with open(transcriptome_logfile_path, "r") as transcriptome_log_file:
for line in transcriptome_log_file:
line = line.rstrip()
if line == "ALL DONE!":
valid_output = True
return valid_output
if __name__ == '__main__':
sys.exit(TranscriptQuantificatAndMoleculeGenerationStep.main())