Source code for camparee.molecule_maker

import os
import json
import pathlib
import sys
import collections
import argparse
import numpy
import pickle

from camparee.abstract_camparee_step import AbstractCampareeStep
from camparee.camparee_constants import CAMPAREE_CONSTANTS

from beers_utils.molecule_packet import MoleculePacket
from beers_utils.molecule import Molecule
from beers_utils.sample import Sample
from beers_utils.cigar import chain_from_splits, split_cigar, query_seq_length
from beers_utils.general_utils import GeneralUtils
from beers_utils.read_fasta import read_fasta

[docs]class MoleculeMakerStep(AbstractCampareeStep):
    """
    MoleculeMaker generates molecules based off of gene, intron, and allelic
    quantification files as well as customized genomic sequence and annotation.

    All input files are looked up inside a per-sample data directory when
    execute() is called. Simulated molecules are then exported as pickled
    molecule packets, as a tab-separated text file, or as an in-memory
    generator of packets, depending on the requested output type.
    """

    # Maps each supported output type to the file extension used for it
    # (indexed by output_type in execute() and is_output_valid()).
    OUTPUT_OPTIONS_W_EXTENSIONS=CAMPAREE_CONSTANTS.MOLECULE_MAKER_OUTPUT_OPTIONS_W_EXTENSIONS
    # Output filename template; formatted with output_type, packet_num and extension.
    OUTPUT_FILENAME_PATTERN=CAMPAREE_CONSTANTS.MOLECULE_MAKER_OUTPUT_FILENAME_PATTERN
    # Packet size used whenever the caller does not supply molecules_per_packet.
    DEFAULT_MOLECULES_PER_PACKET=CAMPAREE_CONSTANTS.MOLECULE_MAKER_DEFAULT_NUM_MOLECULES_PER_PACKET

    # Default filename patterns for output from previous CAMPAREE steps.
    # The "_PARENTAL_*" patterns are formatted with genome_name (1 or 2) to
    # locate the files belonging to each parental genome.
    # NOTE(review): the commented-out names below are not used by the code in
    # this class, which reads the equivalent values straight from
    # CAMPAREE_CONSTANTS inside execute().
    # _GENE_QUANT_FILENAME=CAMPAREE_CONSTANTS.TXQUANT_OUTPUT_GENE_FILENAME
    # _INTRON_QUANT_FILENAME=CAMPAREE_CONSTANTS.INTRON_OUTPUT_FILENAME
    # _TX_QUANT_PSI_FILENAME=CAMPAREE_CONSTANTS.TXQUANT_OUTPUT_PSI_FILENAME
    #_ALLELIC_IMBALANCE_FILENAME=CAMPAREE_CONSTANTS.ALLELIC_IMBALANCE_OUTPUT_FILENAME
    _PARENTAL_TX_FASTA_FILENAME_PATTERN=CAMPAREE_CONSTANTS.TRANSCRIPTOME_FASTA_OUTPUT_FILENAME_PATTERN
    _PARENTAL_ANNOT_FILENAME_PATTERN=CAMPAREE_CONSTANTS.UPDATEANNOT_OUTPUT_FILENAME_PATTERN
    _PARENTAL_GENOME_FASTA_FILENAME_PATTERN=CAMPAREE_CONSTANTS.GENOMEBUILDER_SEQUENCE_FILENAME_PATTERN
    _PARENTAL_GENOME_INDEL_FILENAME_PATTERN=CAMPAREE_CONSTANTS.GENOMEBUILDER_INDEL_FILENAME_PATTERN

    def __init__(self, log_directory_path, data_directory_path=None, parameters=None):
        """Constructor for MoleculeMakerStep object.

        Parameters
        ----------
        log_directory_path : string
            Full path to log directory.
        parameters : dict
            Dictionary of other parameters specified by the config file. This
            parameter is not used by this class and is retained for uniformity
            with all other CAMPAREE steps.

        """
        self.log_directory_path = log_directory_path
        self.data_directory_path = data_directory_path
        self.min_polyA_tail_length = parameters.get("min_polyA_tail_length", 50)
        self.max_polyA_tail_length = parameters.get("max_polyA_tail_length", 250)
        self.parameters = parameters

    # Nearly all of the validation for this step is already performed in the
    # expression_pipeline, since the output options in the config file are
    # specified outside of the standard 'steps' framework.
[docs]    def validate(self):
        if (self.min_polyA_tail_length < 0):
            return False
        if (self.min_polyA_tail_length > self.max_polyA_tail_length):
            return False
        if (self.max_polyA_tail_length < 0):
            return False
        return True

    def load_annotation(self, file_path):
        """
        Read a tab-separated transcript annotation file into a dictionary
        transcript ID -> (chrom, strand, tx_start, tx_end, exon starts, exon ends)

        Lines starting with "#" are skipped. Every other line must provide at
        least ten tab-separated columns; exon starts and ends are
        comma-separated lists of integers.
        """
        annotation_by_transcript = {}
        with open(file_path) as handle:
            for record in handle:
                if record.startswith("#"):
                    continue # Comment/header line

                (chrom, strand, tx_start, tx_end, _exon_count,
                 starts_field, ends_field, transcript_id,
                 _gene_id, _gene_symbol, *_extra) = record.split("\t")

                exon_starts = list(map(int, starts_field.split(",")))
                exon_ends = list(map(int, ends_field.split(",")))
                annotation_by_transcript[transcript_id] = (
                    chrom, strand, int(tx_start), int(tx_end), exon_starts, exon_ends
                )
        return annotation_by_transcript

[docs]    def load_indels(self, file_path, genome):
        """
        Read in the file of indel locations for a given custom genome

        :param file_path: path to the indel file
        :param genome: genomic sequences of this allele

        The indel file is tab-separated with format "chrom:start type length"  and looks like the following:
        1:4897762       I       2
        1:7172141       I       2
        1:7172378       D       1

        Assumption is that the file is sorted by start and no indels overlap

        Returns  a 'split cigar string' meaning a list of tuples (op, length)
        where op is one of M, I, D and length is the length of the match, insert, or deletion
        Good for use with beers_utils.cigar
        """
        genome_cigars = collections.defaultdict(lambda : collections.deque())
        last_indexes = collections.defaultdict(lambda : 0)
        with open(file_path) as indel_file:
            for line in indel_file:
                loc, indel_type, length = line.split('\t')
                chrom, start = loc.split(':')

                start = int(start)
                length = int(length)

                last_index = last_indexes[chrom]
                if last_index < start:
                    # Match up to the start of the indel
                    genome_cigars[chrom].append(
                        ('M', start - last_index)
                    )

                # Add the indel
                if indel_type == 'I':
                    genome_cigars[chrom].append(
                        ('I', length)
                    )
                    last_indexes[chrom] = start
                else:
                    genome_cigars[chrom].append(
                        ('D', length)
                    )
                    last_indexes[chrom] = start + length

        # Gather into a results
        results = dict()
        for chrom, sequence in genome.items():
            if chrom in genome_cigars:
                # Add the tail of the chromosome on, if necessary
                tail = len(sequence) - query_seq_length(genome_cigars[chrom])
                if tail > 0:
                    results[chrom] = list(genome_cigars[chrom]) + [('M', tail)]
                else:
                    results[chrom] = list(genome_cigars[chrom])
            else:
                # All match, no indels
                results[chrom] = [('M', len(sequence))]

        return results

[docs]    def load_intron_quants(self, file_path):
        """
        Load an intron quantification file as two dictionaries,
        (transcript ID -> sum FPK of all introns in transcript) and
        (transcript ID -> list of FPKs of each intron in transcript)
        """
        transcript_intron_quants = dict() # Dictionary transcript -> FPK for all introns in the transcript, combined
        intron_quants = dict() # Dictioanry transcript -> array of FPKs for each intron in the transcript

        with open(file_path) as intron_quants_file:
            for line in intron_quants_file:

                if line.startswith("#"):
                    continue # Comment/header line

                gene, transcript, chrom, strand, transcript_intron_reads_FPK, intron_reads_FPK = line.strip().split("\t")

                transcript_intron_quants[transcript] = float(transcript_intron_reads_FPK)
                intron_quants[transcript] = [float(quant) for quant in intron_reads_FPK.split(",")]

        return transcript_intron_quants, intron_quants

[docs]    def load_gene_quants(self, file_path):
        """
        Read in a gene quantification file as two lists of gene IDs and of their read quantifications
        """
        genes = []
        gene_quants = []

        with open(file_path) as gene_quant_file:
            for line in gene_quant_file:
                if line.startswith("#"):
                    continue # Comment/header line

                gene, quant = line.strip().split("\t")

                genes.append(gene)
                gene_quants.append(float(quant))

        return genes, numpy.array(gene_quants)

[docs]    def load_isoform_quants(self, file_path):
        """
        Reads an isoform quant file into a dictionary gene -> (list of transcript IDs, list of psi values)
        """
        isoform_quants = dict()

        with open(file_path) as isoform_quant_file:
            for line in isoform_quant_file:

                if line.startswith("#"):
                    continue # Comment/header line

                gene, entries = line.strip().split("\t")
                isoforms = [entry.split(":") for entry in entries.split(",")]
                isoforms = [(isoform, float(psi)) for isoform, psi in isoforms]
                isoform_list = [isoform for isoform, psi in isoforms]
                psi_list = [psi for isoform, psi in isoforms]

                isoform_quants[gene] = (isoform_list, psi_list)

        return isoform_quants

[docs]    def load_allelic_quants(self, file_path):
        """
        Reads allelic quantification file into a dictionary: gene_id -> (allele 1 probability, allele 2 probability)
        """

        allelic_quant = dict()

        with open(file_path) as allele_quant_file:
            for line in allele_quant_file:
                if line.startswith("#"):
                    continue # Comment/header line

                gene, allele1, allele2 = line.split("\t")
                allele1 = float(allele1)
                allele2 = float(allele2)

                allelic_quant[gene] = (allele1, allele2)
        return allelic_quant

    def make_molecule(self, sample, rng):
        """Simulate a single molecule drawn from the loaded distributions.

        Relies on the attributes populated by execute(): genes, gene_quants,
        gene_probabilities, isoform_quants, allelic_quant, annotations,
        transcript_intron_quants, genome_cigar_splits and genomes.

        Parameters
        ----------
        sample : Sample
            Only its sample_id is read here, to build the transcript ID.
        rng : numpy Generator
            Source of all random draws (gene, transcript, allele, pre-mRNA
            status and polyA tail length, in that order).

        Returns
        -------
        tuple
            (sequence, start, cigar, ref_start, ref_cigar, strand, chrom,
            transcript_id), where start/cigar are relative to the chosen
            custom genome and ref_start/ref_cigar are the values returned by
            chain_from_splits() for the reference genome.
        """
        # Pick random gene
        gene_index = rng.choice(len(self.genes), p=self.gene_probabilities)
        gene = self.genes[gene_index]
        gene_quant = self.gene_quants[gene_index]

        # Pick random transcript in gene
        transcripts, psis = self.isoform_quants[gene]
        transcript = rng.choice(transcripts, p=psis)

        # Pick random allele based on the gene's allelic distribution
        allele_number = rng.choice([1,2], p=self.allelic_quant[gene])

        # Read in annotation for the chosen transcript
        # (per-allele lists are 0-indexed, allele numbers are 1 and 2)
        chrom,strand,tx_start,tx_end,starts,ends= self.annotations[allele_number - 1][transcript]

        # Determine if pre_mRNA or mature mRNA
        intron_quant = self.transcript_intron_quants[transcript]
        # TODO: check that this gives the appropriate fraction as pre_mRNA
        #       previously was using intron_quant / (intron_quant + gene_quant)
        #       but if assuming everything is either full pre_mRNA or mature mRNA then this should be
        #       the right fraction, which could happen to be greater than one (!)
        # NOTE(review): gene_quant comes from the numpy array built by
        #       load_gene_quants(); dividing by a numpy zero normally yields
        #       inf/nan plus a warning rather than raising ZeroDivisionError,
        #       so the handler below is probably never reached - confirm.
        try:
            fraction_pre_mRNA = min(intron_quant / (gene_quant), 1)
        except ZeroDivisionError:
            # Should not ever get here since a gene with 0 gene_quant should have 0 chance of being chosen
            # however, if we do, we will just always give pre_mRNA
            fraction_pre_mRNA = 1.0

        pre_mRNA = rng.uniform() < fraction_pre_mRNA
        if pre_mRNA:
            # If chosen to be pre_mRNA, overwrite the usual exon starts/ends with a single, big "exon"
            starts = [tx_start]
            ends = [tx_end]

        # Find cigar string relative to the reference genome (i.e. custom_genome_1 or custom_genome_2)
        # Exons become "<length>M" runs and the gaps between consecutive exons become "<length>N" runs.
        gaps = [next_start - last_end - 1 for next_start,last_end in zip(starts[1:],ends[:-1])]
        cigar = ''.join( f"{end - start + 1}M{gap}N" for start,end,gap in zip(starts[:-1],ends[:-1],gaps)) \
                    + f"{ends[-1] - starts[-1] + 1}M"

        # Chain the transcript alignment with this allele's indel cigar to get
        # coordinates on the original reference genome.
        cigar_split = split_cigar(cigar)
        ref_start, ref_cigar, _ = chain_from_splits(
                starts[0], cigar_split, strand,
                1, self.genome_cigar_splits[allele_number - 1][chrom], "+"
        )

        transcript_id = f"{sample.sample_id}_{transcript}_{allele_number}{'_pre_mRNA' if pre_mRNA else ''}"


        # Build the actual sequence
        # (annotation coordinates are treated as 1-based inclusive, hence start-1)
        chrom_sequence = self.genomes[allele_number - 1][chrom]
        sequence = ''.join( chrom_sequence[start-1:end] for start,end in zip(starts, ends) )

        if strand == '-':
            # We always give the sequence from 5' to 3' end of the RNA molecule
            # so reverse complement this
            # NOTE(review): the helper is named create_complement_strand;
            #       presumably it also reverses the sequence - confirm.
            sequence = GeneralUtils.create_complement_strand(sequence)
            # NOTE: cigar string stays the same since that is relative to the + strand

        # TODO: for now, everything gets polyA but maybe shouldn't
        polyA_tail = True
        if polyA_tail:
            # Tail length is drawn from min_polyA_tail_length up to and
            # including max_polyA_tail_length.
            # Add polyA tail to 3' end
            polyA_length = rng.integers(self.min_polyA_tail_length, self.max_polyA_tail_length + 1)
            sequence = sequence + "A"*polyA_length
            # Soft-clip the polyA tail at the end since it shouldn't align
            if strand == "+":
                cigar = cigar + f"{polyA_length}S"
                ref_cigar = ref_cigar + f"{polyA_length}S"
            else:
                cigar =   f"{polyA_length}S" + cigar # Relative to + strand, the A's are going on the 5' end
                ref_cigar = f"{polyA_length}S" + ref_cigar


        return sequence, starts[0], cigar, ref_start, ref_cigar, strand, chrom, transcript_id

    def make_packet(self, sample, rng, id="packet0", N=10_000):
        """Simulate N molecules and bundle them into one MoleculePacket.

        Parameters
        ----------
        sample : Sample
            Sample the molecules belong to; passed on to make_molecule() and
            to the MoleculePacket.
        rng : numpy Generator
            Random number generator handed to make_molecule().
        id : object
            Identifier given to the MoleculePacket.
        N : integer
            Number of molecules to generate.

        Returns
        -------
        MoleculePacket
            Packet holding the N newly created Molecule objects.
        """
        def build_molecule():
            # One simulated molecule wrapped in a Molecule object
            (sequence, start, cigar, ref_start, ref_cigar,
             strand, chrom, transcript_id) = self.make_molecule(sample, rng)
            return Molecule(
                Molecule.new_id(transcript_id),
                sequence,
                start=start,  # relative to the true ('parental') genome
                cigar=cigar,
                strand=strand,
                source_start=ref_start,  # relative to the reference genome
                source_cigar=ref_cigar,
                source_strand=strand,
                transcript_id=transcript_id,
                source_chrom=chrom,
            )

        return MoleculePacket(id, sample, [build_molecule() for _ in range(N)])

[docs]    def make_molecule_file(self, filepath, sample, rng, N=10_000):
        """
        Write out molecules to a tab-separated file

        Note: we write out a molecules start and cigar relative to the appropriate
        custom genome, either _1 or _2 as per the transcript id
        """
        with open(filepath, "w") as molecule_file:
            header = "#transcript_id\tchrom\tstart\tcigar\tref_start\tref_cigar\tstrand\tsequence\n"
            molecule_file.write(header)
            for i in range(N):
                sequence, start, cigar, ref_start, ref_cigar, strand, chrom, transcript_id = self.make_molecule(sample, rng)
                line = "\t".join([transcript_id,
                                  chrom,
                                  str(start),
                                  cigar,
                                  str(ref_start),
                                  ref_cigar,
                                  strand,
                                  sequence]
                                  ) + "\n"

                molecule_file.write(line)

[docs]    def execute(self, sample, sample_data_directory, output_type, output_molecule_count, seed=None,
                molecules_per_packet=None, rng=None):
        """This is the main method that generates simulated molecules and saves/
        exports them in the desired format. It uses the gene, transcript, intron,
        and allelic imbalance distributions generated by the other CAMPAREE steps.


        Parameters
        ----------
        sample : Sample
            Sample object corresponding to the input distributions. When exporting
            molecule packets, this Sample object is used to instantiate the
            MoleculePacket object.
        sample_data_directory : string
            Path to directory containing the data for the sample.
        output_type : string
            Type of file or object used to save or export simulated molecules.
            Sould be one of {', '.join(MoleculeMakerStep.OUTPUT_OPTIONS_W_EXTENSIONS.keys())}.
        output_molecule_count : integer
            Total number of molecules to save/export for the current Sample.
        seed : integer
            [OPTIONAL] Seed for random number generator. Used so repeated runs
            can produce the same results.
        molecules_per_packet : integer
            [OPTIONAL] Maximum number of molecules in each molecule packet. Must
            be positive, non-zero integer (this is not currently checked).
        rng: numpy Generator
            [OPTIONAL] If provided, will use this for generating random numbers. Otherwise,
            uses default RNG
        """

        if rng is None:
            rng = numpy.random.default_rng(seed)

        sample_log_dir = pathlib.Path(self.log_directory_path) / f'sample{sample.sample_id}'
        print(sample_log_dir.resolve())
        sample_log_dir.mkdir(exist_ok=True)
        log_file_path = sample_log_dir / CAMPAREE_CONSTANTS.MOLECULE_MAKER_LOG_FILENAME
        output_file_extension = MoleculeMakerStep.OUTPUT_OPTIONS_W_EXTENSIONS[output_type]

        if not molecules_per_packet:
            molecules_per_packet=MoleculeMakerStep.DEFAULT_MOLECULES_PER_PACKET

        with open(log_file_path, "w") as log_file:

            print(f"Generating molecules for sample{sample.sample_id}.")
            log_file.write(f"Generating molecules for sample{sample.sample_id}.\n")

            intron_quant_path = os.path.join(sample_data_directory, CAMPAREE_CONSTANTS.INTRON_OUTPUT_FILENAME)
            gene_quant_path = os.path.join(sample_data_directory, CAMPAREE_CONSTANTS.TXQUANT_OUTPUT_GENE_FILENAME)
            psi_quant_path = os.path.join(sample_data_directory, CAMPAREE_CONSTANTS.TXQUANT_OUTPUT_PSI_FILENAME)
            allele_quant_path = os.path.join(sample_data_directory, CAMPAREE_CONSTANTS.ALLELIC_IMBALANCE_OUTPUT_FILENAME)

            log_file.write(f"Parameters:\n"
                           f"    Output file type: {output_type}\n"
                           f"    Output file extension: {output_file_extension}\n"
                           f"    Num molecules to generate: {output_molecule_count}\n"
                           f"    Num molecules per packet: {molecules_per_packet}\n"
                           f"    Random seed value: {seed}\n")

            log_file.write(f"Distribution files:\n"
                           f"    Intron quant file: {intron_quant_path}\n"
                           f"    Gene quant file: {gene_quant_path}\n"
                           f"    PSI quant file: {psi_quant_path}\n"
                           f"    Allele quant file: {allele_quant_path}\n")

            print('Loading gene, intron, transcript PSI, and allelic imbalance'
                  ' distributions.')
            log_file.write('Loading gene, intron, transcript PSI, and allelic'
                           ' imbalance distributions.\n')

            # Read and load data from gene, intron, transcript PSI, and allelic
            # imbalance distribution files.
            self.genes, self.gene_quants = self.load_gene_quants(gene_quant_path)
            self.gene_probabilities = self.gene_quants / numpy.sum(self.gene_quants)

            self.transcript_intron_quants, self.intron_quants = self.load_intron_quants(intron_quant_path)
            self.isoform_quants = self.load_isoform_quants(psi_quant_path)
            self.allelic_quant = self.load_allelic_quants(allele_quant_path)

            print('Loading annotations, transcriptome sequences, and genome sequences'
                  ' from both parental genomes.')
            log_file.write('Loading annotations, transcriptome sequences, and genome'
                           ' sequences from both parental genomes.\n')

            # Read and load annotations, as well as full transcriptome and genome
            # sequences for each parental genome. This information is used when
            # generating the simulated molecule sequences.
            self.annotations = \
                [self.load_annotation(os.path.join(sample_data_directory,
                                                   self._PARENTAL_ANNOT_FILENAME_PATTERN.format(genome_name=genome_name)))
                    for genome_name in [1,2]]
            self.genomes = \
                [read_fasta(os.path.join(sample_data_directory,
                                           self._PARENTAL_GENOME_FASTA_FILENAME_PATTERN.format(genome_name=genome_name)),
                            replace_Ns = True,
                            rng = rng)
                    for genome_name in [1,2]]

            print('Loading indel information from both parental genomes.')
            log_file.write('Loading indel information from both parental genomes.\n')

            # Read and load indel data for each parental genome. This information is
            # used when constructing CIGAR strings mapping transcripts back to their
            # locations in the original reference genome.
            self.genome_cigar_splits =  [self.load_indels(
                                                os.path.join(sample_data_directory,
                                                   self._PARENTAL_GENOME_INDEL_FILENAME_PATTERN.format(genome_name=genome_name)),
                                               self.genomes[genome_name-1])
                                            for genome_name in [1,2]]

            # Generate molecules and save/export them according to output type.
            print('Generating molecules and saving/exporting the results.')
            log_file.write('Generating molecules and saving/exporting the results.')
            print(f"Molecule maker output type {repr(output_type)}")
            if output_type == "packet":
                # TODO: potentially rounds down the number of molecules to make
                num_packets = output_molecule_count // molecules_per_packet
                for i in range(1,num_packets+1):
                    print(f"    Generating packet {i} of {num_packets}")
                    log_file.write(f"    Generating packet {i} of {num_packets}\n")
                    packet = self.make_packet(sample=sample, id=f"sample{sample.sample_id}.{i}", N=molecules_per_packet, rng=rng) #TODO: id needs to be an integer

                    molecule_packet_filename = os.path.join(sample_data_directory,
                                                            self.OUTPUT_FILENAME_PATTERN.format(output_type=output_type,
                                                                                                packet_num=i,
                                                                                                extension=output_file_extension))
                    with open(molecule_packet_filename, "wb") as out_file:
                        pickle.dump(packet, out_file)
            elif output_type == "file":
                molecule_output_filename = os.path.join(sample_data_directory,
                                                        self.OUTPUT_FILENAME_PATTERN.format(output_type=output_type,
                                                                                            packet_num="",
                                                                                            extension=output_file_extension))
                print(f"Generating molecule file {molecule_output_filename}.")
                log_file.write(f"Generating molecule file {molecule_output_filename}.")
                self.make_molecule_file(filepath=molecule_output_filename,
                                        N = output_molecule_count,
                                        sample = sample,
                                        rng = rng)
            elif output_type == "generator":
                def generator():
                    num_packets = output_molecule_count // molecules_per_packet
                    print(f"Generating {num_packets} packets")
                    for i in range(1, num_packets+1):
                        packet = self.make_packet(sample=sample, id=i, N=molecules_per_packet, rng=rng)
                        yield packet
                return generator()
            else:
                raise ValueError(f"Expected output_type to be 'packet', 'file', or 'generator'. Instead got {repr(output_type)}")

            log_file.write("\nALL DONE!\n")

[docs]    def get_commandline_call(self, sample, sample_data_directory,
                             output_type, output_molecule_count,
                             seed=None,
                             molecules_per_packet=None):
        """Prepare command to execute the MoleculeMakerStep from the command line,
        given all of the arugments used to run the execute() function.

        Parameters
        ----------
        sample : Sample
            Sample object corresponding to the input distributions. When exporting
            molecule packets, this Sample object is used to instantiate the
            MoleculePacket object.
        sample_data_directory : string
            Path to directory containing the sample data
        output_type : string
            Type of file or object used to save or export simulated molecules.
            Sould be one of {', '.join(MoleculeMakerStep.OUTPUT_OPTIONS_W_EXTENSIONS.keys())}.
        output_molecule_count : integer
            Total number of molecules to save/export for the current Sample.
        seed : integer
            [OPTIONAL] Seed for random number generator. Used so repeated runs
            can produce the same results.
        molecules_per_packet : integer
            [OPTIONAL] Maximum number of molecules in each molecule packet. Must
            be positive, non-zero integer (this is not currently checked).

        Returns
        -------
        string
            Command to execute on the command line. It will perform the same
            operations as a call to execute() with the same parameters.

        """
        #Retrieve path to the allelic_imbalance_quant.py script.
        molecule_maker_step_path = os.path.realpath(__file__)
        #If the above command returns a string with a "pyc" extension, instead
        #of "py", strip off "c" so it points to this script.
        molecule_maker_step_path = molecule_maker_step_path.rstrip('c')

        command = (f" python {molecule_maker_step_path}"
                   f" --parameters '{json.dumps(self.parameters)}'"
                   f" --log_directory_path {self.log_directory_path}"
                   f" --sample_data_directory {sample_data_directory}"
                   f" --sample '{repr(sample)}'"
                   f" --output_type {output_type}"
                   f" --output_molecule_count {output_molecule_count}")
        if seed is not None:
            command += f" --seed {seed}"
        if molecules_per_packet:
            command += f" --molecules_per_packet {molecules_per_packet}"

        return command

[docs]    def get_validation_attributes(self, sample, sample_data_directory,
                                  output_type,
                                  output_molecule_count,
                                  seed=None,
                                  molecules_per_packet=None):
        """Prepare attributes required by is_output_valid() function to validate
        output generated by the MoleculeMakerStep job.

        Parameters
        ----------
        sample : Sample
            Sample object corresponding to the input distributions. When exporting
            molecule packets, this Sample object is used to instantiate the
            MoleculePacket object.
        sample_data_path : string
            Path to directory containing all the sample data.
        output_type : string
            Type of file or object used to save or export simulated molecules.
            Sould be one of {', '.join(MoleculeMakerStep.OUTPUT_OPTIONS_W_EXTENSIONS.keys())}.
        output_molecule_count : integer
            Total number of molecules to save/export for the current Sample.
        seed : integer
            [OPTIONAL] Seed for random number generator. Used so repeated runs
            can produce the same results. [Note: this parameter is captured just
            so get_validation_attributes() accepts the same arguments as
            get_commandline_call(). It is not used here.]
        molecules_per_packet : integer
            [OPTIONAL] Maximum number of molecules in each molecule packet. Must
            be positive, non-zero integer (this is not currently checked).

        Returns
        -------
        dict
            A MoleculeMakerStep job's sample_data_directory, log_directory, corresponding
            sample ID, output file type, output molecule count, and the number of
            molecules per packet.
        """
        validation_attributes = {}
        validation_attributes['log_directory'] = self.log_directory_path
        validation_attributes['sample_data_directory'] = sample_data_directory
        validation_attributes['sample_id'] = sample.sample_id
        validation_attributes['output_type'] = output_type
        validation_attributes['output_molecule_count'] = output_molecule_count
        validation_attributes['molecules_per_packet'] = molecules_per_packet
        return validation_attributes

[docs]    @staticmethod
    def is_output_valid(validation_attributes):
        """Check if output of MoleculeMakerStep for a specific job/execution is
        correctly formed and valid, given a job's data directory, log directory,
        sample ID, output file type, output molecule count, and the number of
        molecules per packet (if provided). Prepare these attributes for a given
        job using the get_validation_attributes() method.

        Parameters
        ----------
        validation_attributes : dict
            A job's data_directory, log_directory, corresponding sample_id,
            output file type, output molecule count, and the number of molecules
            per packet (if provided).

        Returns
        -------
        boolean
            True  - MoleculeMakerStep output files were created and are well formed.
            False - MoleculeMakerStep output files do not exist or are missing data.

        """

        sample_data_directory = validation_attributes['sample_data_directory']
        log_directory_path = validation_attributes['log_directory']
        sample_id = validation_attributes['sample_id']
        output_type = validation_attributes['output_type']
        output_molecule_count = validation_attributes['output_molecule_count']
        molecules_per_packet = validation_attributes.get('molecules_per_packet')
        output_file_extension = MoleculeMakerStep.OUTPUT_OPTIONS_W_EXTENSIONS[output_type]

        if not molecules_per_packet:
            molecules_per_packet = MoleculeMakerStep.DEFAULT_MOLECULES_PER_PACKET

        valid_output = False

        # Construct output filenames/paths
        log_file_path = os.path.join(log_directory_path, f'sample{sample_id}',
                                     CAMPAREE_CONSTANTS.MOLECULE_MAKER_LOG_FILENAME)

        # Check existence of output files. If output type is molecule packet,
        # there will be multiple output files and this code will check that they
        # all exist (based on number of molecules per packet).
        all_molecule_files_exist = False
        if output_type == "packet":
            total_num_packets = output_molecule_count // molecules_per_packet
            num_packets_exist = 0
            for i in range(1, total_num_packets+1):
                molecule_packet_file = os.path.join(sample_data_directory,
                                                    MoleculeMakerStep.OUTPUT_FILENAME_PATTERN.format(output_type=output_type,
                                                                                                     packet_num=i,
                                                                                                     extension=output_file_extension))
                if os.path.isfile(molecule_packet_file):
                    num_packets_exist += 1

            if total_num_packets == num_packets_exist:
                all_molecule_files_exist = True

        elif output_type == "file":
            molecule_file = os.path.join(sample_data_directory,
                                         MoleculeMakerStep.OUTPUT_FILENAME_PATTERN.format(output_type=output_type,
                                                                                          packet_num="",
                                                                                          extension=output_file_extension))
            all_molecule_files_exist = os.path.isfile(molecule_file)
        elif output_type == "generator":
            pass # Always valid
        else:
            # Unknown output type specified
            valid_output = False


        # TODO: Report reason why out validation failed.

        if all_molecule_files_exist and os.path.isfile(log_file_path):

            #Read last line in log file
            line = ""
            with open(log_file_path, "r") as log_file:
                for line in log_file:
                    line = line.rstrip()
            if line == "ALL DONE!":
                valid_output = True

        return valid_output

[docs]    @staticmethod
    def main():
        """Entry point into script. Parses the argument list to obtain all the
        files needed and feeds them to the class constructor. Calls the appropriate
        methods thereafter.
        """
        parser = argparse.ArgumentParser(description='Generate simulated molecules and export/save.')
        parser.add_argument('-l', '--log_directory_path', required=True,
                            help="Path to log directory.")
        parser.add_argument('--parameters', required=True,
                            help="JSON of parameters")
        parser.add_argument('-d', '--sample_data_directory', required=True,
                            help='Path to sample data directory')
        parser.add_argument('--sample', required=True,
                            help='String representation of a Sample object.')
        parser.add_argument('--output_type', required=True,
                            help=f"Type of molecule output ({', '.join(MoleculeMakerStep.OUTPUT_OPTIONS_W_EXTENSIONS.keys())}).")
        parser.add_argument('--output_molecule_count', type=int, required=True,
                            help='Number of molecules to generate.')
        parser.add_argument('--seed', type=int, default=None, required=False,
                            help='Seed value for random number generator.')
        parser.add_argument('--molecules_per_packet', type=int, default=None, required=False,
                            help='Number of molecules per molecule packet. '
                                 'Only used if output_type set to "packet".')
        args = parser.parse_args()
        sample = eval(args.sample)

        molecule_maker = MoleculeMakerStep(
                log_directory_path=args.log_directory_path,
                parameters = json.loads(args.parameters))
        molecule_maker.execute(sample=sample,
                               sample_data_directory=args.sample_data_directory,
                               output_type=args.output_type,
                               output_molecule_count=args.output_molecule_count,
                               seed=args.seed,
                               molecules_per_packet=args.molecules_per_packet)

# Run the step when this module is executed as a script (this is how the
# command built by get_commandline_call() invokes it). main() has no return
# statement, so sys.exit() receives None.
if __name__ == "__main__":
    sys.exit(MoleculeMakerStep.main())
Source code for camparee.molecule_maker

CAMPAREE

Navigation

Related Topics