Source code for cellmaps_utils.apmstool

import json
import os
import sys
import uuid
import shutil
from datetime import date
import logging
import pandas as pd
import cellmaps_utils
from cellmaps_utils.basecmdtool import BaseCommandLineTool
from cellmaps_utils.exceptions import CellMapsError
from cellmaps_utils import constants
from cellmaps_utils.provenance import ProvenanceUtil

logger = logging.getLogger(__name__)


class APMSDataLoader(BaseCommandLineTool):
    """
    Creates RO-Crate of AP-MS data from raw AP-MS tables
    """

    COMMAND = 'apmsconverter'
    BAIT_COL_NAME = 'Bait'

    def __init__(self, theargs, provenance_utils=None):
        """
        Constructor

        :param theargs: Command line arguments that at minimum need to have
                        the following attributes: ``outdir``, ``inputs``,
                        ``name``, ``organization_name``, ``project_name``,
                        ``release``, ``cell_line``, ``treatment``,
                        ``tissue``, ``author``, ``gene_set``,
                        ``baitcolname`` and ``set_name``
        :type theargs: :py:class:`~python.argparse.Namespace`
        :param provenance_utils: Object used to register the RO-Crate and
                                 its datasets, software and computation.
                                 If ``None`` a new ``ProvenanceUtil`` is
                                 created for this instance
        """
        super().__init__()
        self._outdir = os.path.abspath(theargs.outdir)
        self._inputs = theargs.inputs
        self._name = theargs.name
        self._organization_name = theargs.organization_name
        self._project_name = theargs.project_name
        self._release = theargs.release
        self._cell_line = theargs.cell_line
        self._treatment = theargs.treatment
        self._tissue = theargs.tissue
        self._author = theargs.author
        self._gene_set = theargs.gene_set
        self._baitcolname = theargs.baitcolname
        self._set_name = theargs.set_name
        # A default of ProvenanceUtil() in the signature would be evaluated
        # once at class-definition time and shared by every instance, so
        # build it lazily here instead.
        if provenance_utils is None:
            provenance_utils = ProvenanceUtil()
        self._provenance_utils = provenance_utils
        self._softwareid = None
        # Kept so the computation registration can record the arguments
        # this tool was invoked with.
        self._input_data_dict = theargs.__dict__

    def run(self):
        """
        Run method to create RO-Crate from AP-MS data tables. This process
        involves merging input tables, registering the dataset and related
        software in the RO-Crate.

        :raises CellMapsError: If the RO-Crate output directory already
                               exists
        :return: ``0`` upon success
        :rtype: int
        """
        # NOTE: this rewrites self._outdir to the generated RO-Crate
        # sub-directory, so everything below operates on that path.
        self._generate_rocrate_dir_path()
        if os.path.exists(self._outdir):
            raise CellMapsError(self._outdir + ' already exists')

        logger.debug('Creating directory %s', self._outdir)
        os.makedirs(self._outdir, mode=0o755)

        keywords = [self._project_name, self._release, self._cell_line,
                    self._treatment, self._tissue, 'AP-MS edgelist']
        if self._gene_set is not None:
            keywords.append(self._gene_set)
        description = ' '.join(keywords)

        info_dict = {
            constants.DATASET_NAME: self._name,
            constants.DATASET_ORGANIZATION_NAME: self._organization_name,
            constants.DATASET_PROJECT_NAME: self._project_name,
            constants.DATASET_RELEASE: self._release,
            constants.DATASET_CELL_LINE: self._cell_line,
            constants.DATASET_TREATMENT: self._treatment,
            constants.DATASET_TISSUE: self._tissue,
            constants.DATASET_AUTHOR: self._author,
            constants.DATASET_GENE_SET: self._gene_set
        }
        self.save_dataset_info_to_json(self._outdir, info_dict,
                                       constants.DATASET_INFO_FILE)

        self._provenance_utils.register_rocrate(
            self._outdir,
            name=self._name,
            organization_name=self._organization_name,
            project_name=self._project_name,
            description=description,
            keywords=keywords,
            guid=self._get_fairscape_id())

        gen_dsets = []
        file_path = self._merge_and_save_apms_data()
        file_desc = description + ' AP-MS file'
        file_keywords = keywords.copy()
        file_keywords.extend(['file'])
        # skip_copy=True because the merged file was already written
        # inside the RO-Crate directory.
        dset_id = self._provenance_utils.register_dataset(
            rocrate_path=self._outdir,
            source_file=file_path,
            skip_copy=True,
            data_dict={'name': ' AP-MS file',
                       'description': file_desc,
                       'keywords': file_keywords,
                       'data-format': 'tsv',
                       'author': self._author,
                       'version': self._release,
                       'date-published':
                           date.today().strftime('%Y-%m-%d')},
            guid=self._get_fairscape_id())
        gen_dsets.append(dset_id)

        # Software must be registered before the computation because the
        # computation references self._softwareid.
        self._register_software(keywords=keywords, description=description)
        self._register_computation(generated_dataset_ids=gen_dsets,
                                   description=description,
                                   keywords=keywords)
        self._copy_over_apms_readme()
        return 0

    def _get_fairscape_id(self):
        """
        Creates a unique id of the form
        ``<uuid4>:<basename of RO-Crate directory>``

        :return: Newly generated id
        :rtype: str
        """
        return str(uuid.uuid4()) + ':' + os.path.basename(self._outdir)

    def _copy_over_apms_readme(self):
        """
        Copies ``apms_readme.txt``, located in the same directory as this
        module, into the RO-Crate directory as ``readme.txt``
        """
        apms_readme = os.path.join(os.path.dirname(__file__),
                                   'apms_readme.txt')
        shutil.copy(apms_readme, os.path.join(self._outdir, 'readme.txt'))

    def _generate_rocrate_dir_path(self):
        """
        Generates a directory path for the RO-Crate based on provided
        metadata like project name, gene set, cell line, treatment, and
        release version.

        The resulting name has the form
        ``<project>_[<gene set>_]<cell line>_<treatment>[_<set name>]_apms_<release>``
        with spaces replaced by underscores, and is appended to the
        current output directory. The result replaces ``self._outdir``.
        """
        dir_name = self._project_name.lower() + '_'
        if self._gene_set is not None:
            dir_name += self._gene_set.lower() + '_'
        dir_name += self._cell_line.lower() + '_'
        dir_name += self._treatment.lower()
        if self._set_name is not None:
            # set name is intentionally not lower-cased
            dir_name += '_' + self._set_name
        dir_name += '_apms_'
        dir_name += self._release.lower()
        dir_name = dir_name.replace(' ', '_')
        self._outdir = os.path.join(self._outdir, dir_name)

    def _merge_and_save_apms_data(self):
        """
        Merges AP-MS data from input files into a single DataFrame and
        saves the combined data to a TSV file within the RO-Crate
        directory.

        Files ending in ``.csv`` are read as comma delimited, everything
        else as tab delimited. A custom bait column is renamed to
        :py:attr:`BAIT_COL_NAME`.

        :return: The file path to the saved AP-MS data TSV file.
        :rtype: str
        """
        df_list = []
        for input_path in self._inputs:
            thesep = ',' if input_path.endswith('.csv') else '\t'
            # na_filter=False keeps empty cells as empty strings rather
            # than converting them to NaN.
            cur_df = pd.read_csv(input_path, sep=thesep, na_filter=False)

            if self._baitcolname != self.BAIT_COL_NAME:
                if self._baitcolname in cur_df.columns:
                    cur_df.rename({self._baitcolname: self.BAIT_COL_NAME},
                                  axis=1, inplace=True)
                else:
                    # Best effort: keep going, but make the mismatch
                    # visible instead of silently ignoring it.
                    logger.warning('Bait column %s not found in %s',
                                   self._baitcolname, input_path)

            # Handles case where HDAC2 in initial cm4ai dataset
            # had several columns lacking .x suffix
            # we are fixing this by checking for those columns and if
            # found just renaming them in place
            for colname in ['PreyGene', 'NumReplicates', 'AvgP', 'MaxP',
                            'TopoAvgP', 'TopoMaxP', 'SaintScore',
                            'FoldChange', 'BFDR', 'boosted_by']:
                if colname in cur_df.columns:
                    suffixed = colname + '.x'
                    if suffixed in cur_df.columns:
                        # Drop the existing suffixed column so the rename
                        # below does not create a duplicate column name.
                        cur_df.drop(columns=suffixed, inplace=True)
                    cur_df.rename({colname: suffixed}, axis=1,
                                  inplace=True)
            df_list.append(cur_df)

        df = pd.concat(df_list)
        apms_path = os.path.join(self._outdir, constants.APMS_TSV_FILE)
        df.to_csv(apms_path, sep='\t', index=False)
        return apms_path

    def _register_computation(self, generated_dataset_ids=None,
                              description='', keywords=None):
        """
        Registers the computation process in the RO-Crate

        # Todo: added in used dataset, software and what is being generated

        :param generated_dataset_ids: Ids of datasets generated by this
                                      run, defaults to an empty list
        :type generated_dataset_ids: list
        :param description: Base description; ``' run of <package name>'``
                            is appended
        :type description: str
        :param keywords: Base keywords; ``'computation'`` is appended to a
                         copy, defaults to an empty list
        :type keywords: list
        :return:
        """
        if generated_dataset_ids is None:
            generated_dataset_ids = []
        logger.debug('Getting id of input rocrate')
        comp_keywords = [] if keywords is None else list(keywords)
        comp_keywords.extend(['computation'])
        description = description + ' run of ' + cellmaps_utils.__name__
        self._provenance_utils.register_computation(
            self._outdir,
            name='AP-MS',
            run_by=str(self._provenance_utils.get_login()),
            command=str(self._input_data_dict),
            description=description,
            keywords=comp_keywords,
            used_software=[self._softwareid],
            generated=generated_dataset_ids,
            guid=self._get_fairscape_id())

    def _register_software(self, description='', keywords=None):
        """
        Registers this tool with the RO-Crate and stores the resulting
        id in ``self._softwareid``

        :param description: Base description; the package description is
                            appended
        :type description: str
        :param keywords: Base keywords; ``'tools'`` and the package name
                         are appended to a copy, defaults to an empty list
        :type keywords: list
        """
        software_keywords = [] if keywords is None else list(keywords)
        software_keywords.extend(['tools', cellmaps_utils.__name__])
        software_description = description + ' ' + \
            cellmaps_utils.__description__
        self._softwareid = self._provenance_utils.register_software(
            self._outdir,
            name=cellmaps_utils.__name__,
            description=software_description,
            author=cellmaps_utils.__author__,
            version=cellmaps_utils.__version__,
            file_format='py',
            keywords=software_keywords,
            url=cellmaps_utils.__repo_url__,
            guid=self._get_fairscape_id())

    @staticmethod
    def add_subparser(subparsers):
        """
        Adds a command-line subparser for the APMSDataLoader tool.

        :param subparsers: Object returned by
                           :py:meth:`argparse.ArgumentParser.add_subparsers`
        :return: The parser created for the
                 :py:attr:`APMSDataLoader.COMMAND` command
        :rtype: :py:class:`argparse.ArgumentParser`
        """
        desc = """

        Version {version}

        {cmd} Loads AP-MS data into a RO-Crate

        """.format(version=cellmaps_utils.__version__,
                   cmd=APMSDataLoader.COMMAND)

        parser = subparsers.add_parser(
            APMSDataLoader.COMMAND,
            help='Loads AP-MS data into a RO-Crate',
            description=desc,
            formatter_class=constants.ArgParseFormatter)
        parser.add_argument('outdir',
                            help='Directory where RO-Crate will be created')
        parser.add_argument('--inputs', required=True, nargs="+",
                            help='One or more table files with the following '
                                 'fields: [Bait, Prey] and for filtering also '
                                 'containing [BFDR.x, logOddsScore]')
        parser.add_argument('--author', default='Krogan Lab',
                            help='Author that created this data')
        parser.add_argument('--name', default='AP-MS',
                            help='Name of this run, needed for FAIRSCAPE')
        parser.add_argument('--organization_name', default='Krogan Lab',
                            help='Name of organization running this tool, needed '
                                 'for FAIRSCAPE. Usually set to lab')
        parser.add_argument('--project_name', default='CM4AI',
                            help='Name of project running this tool, needed for '
                                 'FAIRSCAPE. Usually set to funding source')
        parser.add_argument('--release', required=True,
                            help='Version of release. For example: 0.1 alpha')
        parser.add_argument('--treatment', default='untreated',
                            choices=['paclitaxel', 'vorinostat', 'untreated'],
                            help='Treatment of sample.')
        parser.add_argument('--cell_line', default='MDA-MB-468',
                            help='Name of cell line. For example MDA-MB-468')
        parser.add_argument('--gene_set', choices=['chromatin', 'metabolic'],
                            default='chromatin',
                            help='Gene set for dataset')
        parser.add_argument('--set_name',
                            help='If set, adds value to RO-Crate folder name before _apms_<version>. '
                                 'Example values set1')
        # The default is listed in choices so that passing it explicitly
        # on the command line is accepted as well.
        parser.add_argument('--tissue',
                            choices=['undifferentiated', 'neuron',
                                     'cardiomyocytes', '',
                                     'breast; mammary gland'],
                            default='breast; mammary gland',
                            help='Tissue for dataset. Since the default --cell_line '
                                 'is MDA-MB-468, this value is set to the tissue '
                                 'for that cell line')
        parser.add_argument('--baitcolname', default='Bait',
                            help='Name of bait column in input file(s)')
        return parser