Source code for cellmaps_utils.crisprtool

import json
import os
import shutil
import uuid
from datetime import date
import re
import logging
import cellmaps_utils
from cellmaps_utils.basecmdtool import BaseCommandLineTool
from cellmaps_utils.exceptions import CellMapsError
from cellmaps_utils import constants
from cellmaps_utils.provenance import ProvenanceUtil

logger = logging.getLogger(__name__)



[docs]
class CRISPRDataLoader(BaseCommandLineTool):
    """
    Creates RO-Crate of CRISPR data from
    raw CRISPR data files
    """
    COMMAND = 'crisprconverter'

    def __init__(self, theargs,
                 provenance_utils=ProvenanceUtil()):
        """
        Constructor

        :param theargs: Command line arguments that at minimum need
                        to have the following attributes:
        :type theargs: :py:class:`~python.argparse.Namespace`
        """
        super().__init__()
        self._outdir = os.path.abspath(theargs.outdir)
        self._name = theargs.name
        self._organization_name = theargs.organization_name
        self._project_name = theargs.project_name
        self._release = theargs.release
        self._cell_line = theargs.cell_line
        self._tissue = theargs.tissue
        self._treatment = theargs.treatment
        self._author = theargs.author
        self._gene_set = theargs.gene_set
        self._h5ad = theargs.h5ad
        self._dataset = theargs.dataset
        self._skipcopy = theargs.skipcopy
        self._num_perturb_guides = theargs.num_perturb_guides
        self._num_non_target_ctrls = theargs.num_non_target_ctrls
        self._num_screen_targets = theargs.num_screen_targets
        self._provenance_utils = provenance_utils
        self._softwareid = None
        self._input_data_dict = theargs.__dict__


[docs]
    def run(self):
        """
        Runs the process of CRISPR data loading into a RO-Crate.
        It includes generating the output directory,
        linking and registering h5ad file and registering the
        computation and software used in the process.

        :return:
        """
        self._generate_rocrate_dir_path()
        if os.path.exists(self._outdir):
            raise CellMapsError(self._outdir + ' already exists')

        logger.debug('Creating directory ' + str(self._outdir))
        os.makedirs(self._outdir, mode=0o755)

        keywords = [self._project_name, self._release,
                    self._cell_line, self._treatment, 'CRISPR',
                    self._tissue, self._dataset]

        if self._gene_set is not None:
            keywords.append(self._gene_set)

        description = ' '.join(keywords)

        info_dict = {
            constants.DATASET_NAME: self._name,
            constants.DATASET_ORGANIZATION_NAME: self._organization_name,
            constants.DATASET_PROJECT_NAME: self._project_name,
            constants.DATASET_RELEASE: self._release,
            constants.DATASET_CELL_LINE: self._cell_line,
            constants.DATASET_TREATMENT: self._treatment,
            constants.DATASET_TISSUE: self._tissue,
            constants.DATASET_AUTHOR: self._author,
            constants.DATASET_GENE_SET: self._gene_set,
            constants.DATASET_COLLECTION_SET: self._dataset
        }
        self.save_dataset_info_to_json(self._outdir, info_dict,
                                       constants.DATASET_INFO_FILE)

        self._provenance_utils.register_rocrate(self._outdir,
                                                name=self._name,
                                                organization_name=self._organization_name,
                                                project_name=self._project_name,
                                                description=description,
                                                keywords=keywords,
                                                guid=self._get_fairscape_id())
        gen_dsets = []

        gen_dsets.extend(self._link_and_register_h5ad(keywords=keywords,
                                                      description=description))
        self._register_software(keywords=keywords, description=description)
        self._register_computation(generated_dataset_ids=gen_dsets,
                                   description=description,
                                   keywords=keywords)
        self._copy_over_crispr_readme()
        return 0


    def _get_fairscape_id(self):
        """
        Creates a unique id
        :return:
        """
        return str(uuid.uuid4()) + ':' + os.path.basename(self._outdir)

    def _get_dataset_description(self):
        """
        Provides a description of the dataset based on its type (1channel or subset).

        :return: A string description of the dataset.
        :rtype: str
        """
        if self._dataset.lower() == '1channel':
            return 'FASTQs file were obtain by concatenating 3 NGS run over 7 sequencing lanes'
        elif self._dataset.lower() == 'subset':
            return 'Subset run'
        return ''

    def _link_and_register_h5ad(self, description='',
                                keywords=[]):
        """
        Processes expression file by optionally copying it to the output directory and
        registering the file in the
        RO-Crate metadata.

        :param description: A base description for the files being processed.
        :type description: str
        :param keywords: A list of keywords associated with these files for metadata purposes.
        :type keywords: list
        :return: A list of dataset identifiers for the registered expression files.
        :rtype: list
        """
        dset_ids = []

        dest_file = os.path.join(self._outdir, constants.PERTURBATION_FILE)
        if self._skipcopy is True:
            open(dest_file, 'a').close()
        else:
            self._link_or_copy(self._h5ad, dest_file)
        file_desc = description + ' file'
        file_keywords = keywords.copy()
        file_keywords.extend(['file'])
        dset_id = self._provenance_utils.register_dataset(rocrate_path=self._outdir, source_file=dest_file,
                                                          skip_copy=True,
                                                          data_dict={'name': 'Single pooled crispr screens analysis',
                                                                     'description': file_desc,
                                                                     'keywords': file_keywords,
                                                                     'data-format': 'h5ad',
                                                                     'author': self._author,
                                                                     'version': self._release,
                                                                     'date-published': date.today().strftime(
                                                                         '%Y-%m-%d')},
                                                          guid=self._get_fairscape_id())
        dset_ids.append(dset_id)

        return dset_ids

    def _link_or_copy(self, src, dest):
        """
        Attempts to hardlink src to dest and if
        that fails perform a regular copy

        :param src:
        :param dest:
        :return:
        """
        try:
            os.link(src, dest)
        except OSError as e:
            logger.warning('Falling back to copy because unable to '
                           'hardlink ' + src + ' to ' +
                           dest + ' : ' + str(e))
            shutil.copy(src, dest)

    def _create_token_replacement_map(self):
        """
        Generates a map of tokens to their respective replacement values for use in modifying the CRISPR readme file.

        :return: A dictionary where each key is a token to replace and each value is the replacement string.
        :rtype: dict
        """
        return {'@@H5AD@@': constants.PERTURBATION_FILE,
                '@@CELL_LINE@@': self._cell_line,
                '@@TREATMENT@@': self._treatment,
                '@@NUM_SCREEN_TARGETS_AND_GENE_SET@@': str(self._num_screen_targets) +
                                                       ' ' + self._gene_set,
                '@@NUM_NON_TARGET_CTRLS@@': str(self._num_non_target_ctrls),
                '@@NUM_PERTURB_GUIDES@@': str(self._num_perturb_guides)}

    def _copy_over_crispr_readme(self):
        """
        Copies over crispr_readme.txt

        """
        crispr_readme = os.path.join(os.path.dirname(__file__),
                                     'crispr_readme.txt')

        tokenmap = self._create_token_replacement_map()
        result_readme = os.path.join(self._outdir, 'readme.txt')
        with open(result_readme, 'w') as fout:
            fout.write(self._dataset + ' ' +
                       self._get_dataset_description() +
                       '\n\n')
            with open(crispr_readme, 'r') as f:
                for line in f:
                    line_to_write = self._replace_readme_tokens(line,
                                                                tokenmap=tokenmap)
                    fout.write(line_to_write)

    def _replace_readme_tokens(self, line, tokenmap=None):
        """
        Replaces tokens in a line of the CRISPR readme file with their corresponding values from the token
        replacement map.

        :param line: The current line from the readme file to process.
        :type line: str
        :param tokenmap: A map of tokens and their replacement values.
        :type tokenmap: dict
        :return: The line with tokens replaced by their respective values.
        :rtype: str
        """
        for token in tokenmap.keys():
            if token in line:
                return line.replace(token, tokenmap[token])
        return line

    def _generate_rocrate_dir_path(self):
        """
        Generates the directory path for the RO-Crate based on project name, gene set, cell line, treatment type,
        tissue, dataset type, and release version.
        """
        dir_name = self._project_name.lower() + '_'
        if self._gene_set is not None:
            dir_name += self._gene_set.lower() + '_'
        dir_name += self._cell_line.lower() + '_'
        dir_name += re.sub(r'[^a-zA-Z0-9\w\n\.]', '_', self._tissue.lower()) + '_'
        dir_name += self._treatment.lower() + '_crispr_'
        if self._dataset.lower() != '':
            dir_name += self._dataset.lower() + '_'
        dir_name += self._release.lower()

        dir_name = dir_name.replace(' ', '_')
        self._outdir = os.path.join(self._outdir, dir_name)

    def _register_computation(self, generated_dataset_ids=[],
                              description='',
                              keywords=[]):
        """
        Registers the computation.
        # Todo: added in used dataset, software and what is being generated
        :return:
        """
        logger.debug('Getting id of input rocrate')
        comp_keywords = keywords.copy()
        comp_keywords.extend(['computation'])
        description = description + ' run of ' + cellmaps_utils.__name__
        self._provenance_utils.register_computation(self._outdir,
                                                    name='CRISPR',
                                                    run_by=str(self._provenance_utils.get_login()),
                                                    command=str(self._input_data_dict),
                                                    description=description,
                                                    keywords=comp_keywords,
                                                    used_software=[self._softwareid],
                                                    generated=generated_dataset_ids,
                                                    guid=self._get_fairscape_id())

    def _register_software(self, description='',
                           keywords=[]):
        """
        Registers this tool

        :raises CellMapsImageEmbeddingError: If fairscape call fails
        """
        software_keywords = keywords.copy()
        software_keywords.extend(['tools', cellmaps_utils.__name__])
        software_description = description + ' ' + \
                               cellmaps_utils.__description__
        self._softwareid = self._provenance_utils.register_software(self._outdir,
                                                                    name=cellmaps_utils.__name__,
                                                                    description=software_description,
                                                                    author=cellmaps_utils.__author__,
                                                                    version=cellmaps_utils.__version__,
                                                                    file_format='py',
                                                                    keywords=software_keywords,
                                                                    url=cellmaps_utils.__repo_url__,
                                                                    guid=self._get_fairscape_id())


[docs]
    def add_subparser(subparsers):
        """
        Adds a subparser for the CRISPR data loader command.

        :return:
        """
        desc = """

        Version {version}

        {cmd} Loads CRISPR data into a RO-Crate by creating a
        directory, copying over relevant and using FAIRSCAPE CLI to
        register the data files in the directory known as an RO-Crate

        """.format(version=cellmaps_utils.__version__,
                   cmd=CRISPRDataLoader.COMMAND)

        parser = subparsers.add_parser(CRISPRDataLoader.COMMAND,
                                       help='Loads CRISPR data into a RO-Crate',
                                       description=desc,
                                       formatter_class=constants.ArgParseFormatter)
        parser.add_argument('outdir',
                            help='Directory where RO-Crate will be created')
        parser.add_argument('--skipcopy', action='store_true',
                            help='If set, --h5ad file will not be copied, '
                                 'but instead a 0 byte file will be placed in the RO-Crate as a placeholder. '
                                 'It is up to the caller to manually move/copy the files over before distribution')
        parser.add_argument('--h5ad', required=True,
                            help='Path to h5ad file')
        parser.add_argument('--author', default='Mali Lab',
                            help='Author that created this data')
        parser.add_argument('--name', default='CRISPR',
                            help='Name of this run, needed for FAIRSCAPE')
        parser.add_argument('--organization_name', default='Mali Lab',
                            help='Name of organization running this tool, needed '
                                 'for FAIRSCAPE. Usually set to lab')
        parser.add_argument('--project_name', default='CM4AI',
                            help='Name of project running this tool, '
                                 'needed for FAIRSCAPE. Usually set to '
                                 'funding source')
        parser.add_argument('--release', required=True,
                            help='Version of release. '
                                 'For example: 0.1 alpha')
        parser.add_argument('--treatment', default='untreated',
                            choices=['paclitaxel', 'vorinostat', 'untreated'],
                            help='Treatment of sample.')
        parser.add_argument('--dataset', required=True,
                            choices=['1channel', 'subset', '4channel'],
                            help='Collection set')
        parser.add_argument('--cell_line', default='MDA-MB-468',
                            choices=['MDA-MB-468', 'KOLF2.1J'],
                            help='Name of cell line. For example MDA-MB-468')
        parser.add_argument('--gene_set', choices=['chromatin', 'metabolic'],
                            default='chromatin',
                            help='Gene set for dataset')
        parser.add_argument('--tissue', choices=['undifferentiated', 'neuron',
                                                 'cardiomyocytes', ''],
                            default='breast; mammary gland',
                            help='Tissue for dataset. Since the default --cell_line '
                                 'is MDA-MB-468, this value is set to the tissue '
                                 'for that cell line')
        parser.add_argument('--num_perturb_guides', default='6',
                            help='Number of guides per perturbation')
        parser.add_argument('--num_non_target_ctrls', default='109',
                            help='Number of non targeting controls')
        parser.add_argument('--num_screen_targets', default='108',
                            help='Number of screen targets')
        return parser