Source code for cellmaps_utils.tabletool

import os
import tempfile
import tarfile
import shutil
from datetime import date
import logging
import csv
import cellmaps_utils
from cellmaps_utils.basecmdtool import BaseCommandLineTool
from cellmaps_utils.exceptions import CellMapsError
from cellmaps_utils import constants
from cellmaps_utils.provenance import ProvenanceUtil

# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)


class TableFromROCrates(BaseCommandLineTool):
    """
    Creates table of meta data and links from one or more RO-Crates
    """
    COMMAND = 'rocratetable'

    # Column headers of the generated table
    ID_COL = 'FAIRSCAPE ARK ID'
    DATE_COL = 'Date'
    COMPUTATION_COL = 'Name'
    DESCRIPTION_COL = 'Description'
    KEYWORDS_COL = 'Keywords'
    DOWNLOAD_COL = 'Download RO-Crate Data Package'
    DOWNLOAD_COL_SIZE = 'Download RO-Crate Data Package Size MB'
    GENERATED_COL = 'Generated By Software'
    OUTPUT_COL = 'Output Dataset'
    RESPONSIBLE_COL = 'Responsible Lab'
    TYPE_COL = 'Type'
    CELL_LINE_COL = 'Cell Line'
    TREATMENT_COL = 'Treatment'
    TISSUE_COL = 'Tissue'
    GENESET_COL = 'Gene set'
    VERSION_COL = 'Version'

    # Values written to TYPE_COL
    DATA_ROCRATE = 'Data'
    MODEL_ROCRATE = 'Model'
    INTERMEDIATE_ROCRATE = 'Intermediate'
    OTHER_ROCRATE = 'Other'

    # NOTE(review): GENERATED_COL appears twice, so the table has two
    # identical 'Generated By Software' columns. Left unchanged because
    # removing one alters the layout of the emitted file -- confirm whether
    # consumers of data.tsv rely on it before deduplicating.
    COLUMNS = [ID_COL, DATE_COL, VERSION_COL, TYPE_COL, CELL_LINE_COL,
               TISSUE_COL, TREATMENT_COL, GENESET_COL, GENERATED_COL,
               COMPUTATION_COL, DESCRIPTION_COL, KEYWORDS_COL, DOWNLOAD_COL,
               DOWNLOAD_COL_SIZE, GENERATED_COL, OUTPUT_COL, RESPONSIBLE_COL]

    def __init__(self, theargs, provenance_utils=None):
        """
        Constructor

        :param theargs: Command line arguments that at minimum need to
                        have the following attributes: ``outdir``,
                        ``rocrates``, ``date``, ``release`` and
                        ``downloadurlprefix``
        :type theargs: :py:class:`~python.argparse.Namespace`
        :param provenance_utils: Wrapper used to read RO-Crate metadata.
                                 If ``None`` a new
                                 :py:class:`ProvenanceUtil` is created
        """
        super().__init__()
        self._outdir = os.path.abspath(theargs.outdir)
        self._rocrates = theargs.rocrates
        self._date = theargs.date
        self._version = theargs.release
        # Built per instance instead of once in the signature, so separate
        # tools never share a single default ProvenanceUtil object
        if provenance_utils is None:
            provenance_utils = ProvenanceUtil()
        self._provenance_utils = provenance_utils
        self._downloadurlprefix = theargs.downloadurlprefix
        self._input_data_dict = theargs.__dict__

    def _get_rocrate_type(self, compname):
        """
        Gets type of rocrate, which can be Data, Model, or Intermediate

        :param compname: Name of computation
        :type compname: str
        :return: :py:const:`DATA_ROCRATE`, :py:const:`MODEL_ROCRATE` or
                 :py:const:`INTERMEDIATE_ROCRATE`
        :rtype: str
        """
        if compname in ['IF images', 'AP-MS', 'CRISPR']:
            return TableFromROCrates.DATA_ROCRATE
        if compname in ['Hierarchy']:
            return TableFromROCrates.MODEL_ROCRATE
        return TableFromROCrates.INTERMEDIATE_ROCRATE

    def _get_cell_line(self, keywords):
        """
        Looks for cell line in keywords

        :param keywords:
        :type keywords: list
        :return: First known cell line found in **keywords** or
                 ``Unknown``
        :rtype: str
        """
        known_cell_lines = ('MDA-MB-468', 'KOLF2.1J')
        return next((cell_line for cell_line in known_cell_lines
                     if cell_line in keywords), 'Unknown')

    def _get_treatment(self, keywords):
        """
        Gets treatment from keywords

        :param keywords:
        :return: Comma delimited known treatments found in **keywords**
                 or empty string if none were found
        :rtype: str
        """
        known_treatments = ('untreated', 'vorinostat', 'paclitaxel')
        return ','.join(treatment for treatment in known_treatments
                        if treatment in keywords)

    def _get_tissue(self, keywords):
        """
        Gets tissue from keywords

        :param keywords:
        :return: First known tissue found in **keywords** or empty string
        :rtype: str
        """
        known_tissues = ('undifferentiated', 'neuron', 'cardiomyocytes',
                         'breast; mammary gland')
        return next((tissue for tissue in known_tissues
                     if tissue in keywords), '')

    def _get_geneset(self, keywords):
        """
        Gets gene set from keywords

        :param keywords:
        :return: Comma delimited known gene sets found in **keywords**
                 or ``Unknown`` if none were found
        :rtype: str
        """
        known_genesets = ('chromatin', 'metabolic')
        genesets = [geneset for geneset in known_genesets
                    if geneset in keywords]
        if not genesets:
            return 'Unknown'
        return ','.join(genesets)

    def run(self):
        """
        Runs the process of creation a metadata table from one or more
        RO-Crates. This method iterates through each RO-Crate, collects
        metadata and provenance information, and writes it into a
        tab-separated values (TSV) file named ``data.tsv`` under the
        output directory.

        :raises CellMapsError: If the output directory already exists
        :return: ``0`` upon success
        :rtype: int
        """
        if os.path.exists(self._outdir):
            raise CellMapsError(self._outdir + ' already exists')

        logger.debug('Creating directory ' + str(self._outdir))
        os.makedirs(self._outdir, mode=0o755)

        # --rocrates is optional on the command line so it can be None;
        # in that case only the header row is written
        rocrates = self._rocrates or []
        if not rocrates:
            logger.warning('No RO-Crates given, table will only '
                           'contain a header')

        table_file = os.path.join(self._outdir, 'data.tsv')
        with open(table_file, 'w', newline='') as f:
            writer = csv.DictWriter(f, delimiter='\t',
                                    fieldnames=TableFromROCrates.COLUMNS)
            writer.writeheader()
            for rocrate in rocrates:
                writer.writerow(self._get_row_for_rocrate(rocrate))
        return 0

    def _get_row_for_rocrate(self, rocrate):
        """
        Builds the table row for a single RO-Crate

        :param rocrate: Path to directory, ro-crate meta data file,
                        tar file, or tar.gz file
        :type rocrate: str
        :return: Column name => value for every column in
                 :py:const:`COLUMNS`
        :rtype: dict
        """
        rocrate_dict = self._get_rocrate_as_dict(rocrate)

        # normpath drops any trailing path separator; without it basename
        # of 'some/dir/' is '' and the download link would be '.tar.gz'
        urlfile = os.path.basename(os.path.normpath(rocrate))
        if not urlfile.endswith('.tar.gz'):
            urlfile += '.tar.gz'

        prov_attrs = self._provenance_utils.get_rocrate_provenance_attributes(rocrate=rocrate_dict)
        keywords = prov_attrs.get_keywords()

        gen_col_value = self._get_software_url(rocrate_dict=rocrate_dict)
        if gen_col_value == 'cellmaps_utils':
            # RO-Crate was registered with cellmaps_utils itself, so
            # show the organization in place of a software URL
            gen_col_value = '- ' + prov_attrs.get_organization_name() + ' -'

        output_url = self._get_output_dataset_url(rocrate_dict=rocrate_dict)

        comp_name = self._get_computation_name(rocrate_dict=rocrate_dict)
        if comp_name is None:
            comp_name = prov_attrs.get_name()

        logger.debug('%s: Keywords: %s', rocrate, keywords)

        return {TableFromROCrates.ID_COL: self._provenance_utils.get_id_of_rocrate(rocrate=rocrate_dict),
                TableFromROCrates.DATE_COL: self._date,
                TableFromROCrates.VERSION_COL: self._version,
                TableFromROCrates.TYPE_COL: self._get_rocrate_type(comp_name),
                TableFromROCrates.CELL_LINE_COL: self._get_cell_line(keywords),
                TableFromROCrates.TISSUE_COL: self._get_tissue(keywords),
                TableFromROCrates.TREATMENT_COL: self._get_treatment(keywords),
                TableFromROCrates.GENESET_COL: self._get_geneset(keywords),
                TableFromROCrates.COMPUTATION_COL: comp_name,
                TableFromROCrates.DESCRIPTION_COL: prov_attrs.get_description(),
                TableFromROCrates.KEYWORDS_COL: ','.join(keywords),
                TableFromROCrates.DOWNLOAD_COL: self._get_rocrate_download_link(urlfile),
                TableFromROCrates.DOWNLOAD_COL_SIZE: self._get_rocrate_size(rocrate),
                TableFromROCrates.GENERATED_COL: gen_col_value,
                TableFromROCrates.OUTPUT_COL: output_url,
                TableFromROCrates.RESPONSIBLE_COL: prov_attrs.get_organization_name()}

    def _get_rocrate_size(self, rocrate_path):
        """
        Determines the size of the RO-Crate file. If **rocrate_path** is
        not a file, the size of **rocrate_path** with ``.tar.gz`` appended
        is used instead.

        :param rocrate_path: The file path to the RO-Crate or its
                             tar.gz archive.
        :type rocrate_path: str
        :return: Size in megabytes rounded to nearest integer with a
                 minimum of ``1``, or ``?`` if no file was found
        :rtype: str
        """
        # Strip trailing separators so 'some/dir/' resolves to
        # 'some/dir.tar.gz' and not 'some/dir/.tar.gz'
        separators = os.sep + (os.altsep or '')
        candidates = (rocrate_path,
                      rocrate_path.rstrip(separators) + '.tar.gz')
        for candidate in candidates:
            if os.path.isfile(candidate):
                size_in_mb = round(os.path.getsize(candidate) / 1048576.0)
                return str(max(size_in_mb, 1))
        return '?'

    def _get_rocrate_download_link(self, urlfile):
        """
        Constructs a download link for the RO-Crate by appending
        **urlfile** to the download url prefix given in the constructor.

        :param urlfile: The base name of the RO-Crate file for which to
                        construct the download link.
        :type urlfile: str
        :return: The full URL to download the RO-Crate
        :rtype: str
        """
        return self._downloadurlprefix + urlfile

    def _get_software_url(self, rocrate_dict=None):
        """
        Retrieves the URL of the software used to generate the RO-Crate.
        Only the first software entry in the RO-Crate is examined.

        :param rocrate_dict: The RO-Crate metadata as a dictionary.
        :type rocrate_dict: dict
        :return: The URL of the software, its name if generated by
                 cellmaps_utils, or ``None`` if the RO-Crate has no
                 software entry.
        :rtype: str
        """
        entry = next(self._get_next_software_from_rocrate_dict(rocrate_dict=rocrate_dict),
                     None)
        if entry is None:
            return None
        if entry['name'] == 'cellmaps_utils':
            return entry['name']
        # Plain URL is returned; an html <a href> fragment was
        # considered but is not used
        return entry['url']

    def _get_next_entry_with_metadata_type(self, rocrate_dict, metadata_type):
        """
        Generator that gets next ``@graph`` entry from **rocrate_dict**
        whose ``metadataType`` contains **metadata_type**

        :param rocrate_dict:
        :type rocrate_dict: dict
        :param metadata_type: Value to look for in ``metadataType``
        :type metadata_type: str
        :raises CellMapsError: If ``@graph`` is not in **rocrate_dict**
        :return: next matching section of RO-CRATE as dictionary
        :rtype: dict
        """
        if '@graph' not in rocrate_dict:
            raise CellMapsError('No @graph, but found: ' +
                                str(rocrate_dict.keys()))
        for graph_entry in rocrate_dict['@graph']:
            if 'metadataType' not in graph_entry:
                continue
            if metadata_type in graph_entry['metadataType']:
                yield graph_entry

    def _get_next_software_from_rocrate_dict(self, rocrate_dict=None):
        """
        Generator that gets next software from **rocrate** dict
        passed in

        :param rocrate_dict:
        :type rocrate_dict: dict
        :raises CellMapsError: If ``@graph`` is not in **rocrate_dict**
        :return: next software section of RO-CRATE as dictionary
        :rtype: dict
        """
        yield from self._get_next_entry_with_metadata_type(rocrate_dict,
                                                           'EVI#Software')

    def _get_next_computation_from_rocrate_dict(self, rocrate_dict=None):
        """
        Generator that gets next computation from **rocrate** dict
        passed in

        :param rocrate_dict:
        :type rocrate_dict: dict
        :raises CellMapsError: If ``@graph`` is not in **rocrate_dict**
        :return: next computation section of RO-CRATE as dictionary
        :rtype: dict
        """
        yield from self._get_next_entry_with_metadata_type(rocrate_dict,
                                                           'EVI#Computation')

    def _get_computation_name(self, rocrate_dict=None):
        """
        Extracts the computation name from the RO-Crate metadata.
        Only the first computation entry in the RO-Crate is examined.

        :param rocrate_dict: The RO-Crate metadata as a dictionary.
        :type rocrate_dict: dict
        :return: The name of the computation, or ``None`` if there is no
                 computation entry or its name contains ``' computation'``.
        :rtype: str
        """
        entry = next(self._get_next_computation_from_rocrate_dict(rocrate_dict=rocrate_dict),
                     None)
        if entry is None or ' computation' in entry['name']:
            return None
        return entry['name']

    def _get_next_output_dataset_url(self, rocrate_dict=None,
                                     name='Output Dataset'):
        """
        Generator that gets url from next ``@graph`` entry whose name
        is **name**. Matching entries lacking ``url`` are skipped.

        :param rocrate_dict:
        :type rocrate_dict: dict
        :param name: Value of ``name`` to match
        :type name: str
        :raises CellMapsError: If ``@graph`` is not in **rocrate_dict**
        :return: url of next matching entry
        """
        if '@graph' not in rocrate_dict:
            raise CellMapsError('No @graph, but found: ' +
                                str(rocrate_dict.keys()))
        for graph_entry in rocrate_dict['@graph']:
            if 'name' not in graph_entry:
                continue
            if graph_entry['name'] == name and 'url' in graph_entry:
                yield graph_entry['url']

    def _get_output_dataset_url(self, rocrate_dict=None):
        """
        Retrieves the URL of the first output dataset from the RO-Crate.

        :param rocrate_dict: The RO-Crate as a dictionary.
        :type rocrate_dict: dict
        :return: The URL of the output dataset or an empty string if
                 not found.
        """
        url = next(self._get_next_output_dataset_url(rocrate_dict=rocrate_dict),
                   None)
        # Empty string, as documented, both when there is no matching
        # dataset and when its url is None
        if url is None:
            return ''
        return url

    def _get_rocrate_as_dict(self, rocrate=None):
        """
        Gets ro-crate-metadata.json as a dict from **rocrate**

        :param rocrate: Path to directory, ro-crate meta data file,
                        tar file, or tar.gz file
        :type rocrate: str
        :raises CellMapsError: If **rocrate** is neither a directory
                               nor a file
        :return: The RO-Crate as a dictionary
        """
        if os.path.isdir(rocrate):
            return self._provenance_utils.get_rocrate_as_dict(rocrate_path=rocrate)
        if not os.path.isfile(rocrate):
            raise CellMapsError('Invalid rocrate: ' + str(rocrate))
        if rocrate.endswith(('.tar', '.tar.gz', '.tgz')):
            return self._get_rocrate_as_dict_from_tarball(rocrate=rocrate)
        return self._provenance_utils.get_rocrate_as_dict(rocrate_path=rocrate)

    def _get_rocrate_as_dict_from_tarball(self, rocrate=None):
        """
        Extracts the RO-Crate metadata from a tarball and returns it as
        a dictionary. The metadata must be named
        ``ro-crate-metadata.json`` and reside exactly one directory deep
        in the archive.

        :param rocrate: Path to the tarball containing the RO-Crate.
        :type rocrate: str
        :raises CellMapsError: If no ro-crate-metadata.json is found
        :return: The RO-Crate as a dictionary.
        """
        with tarfile.open(rocrate) as tar:
            for ti in tar:
                split_path = ti.name.split('/')
                if len(split_path) != 2:
                    continue
                if split_path[1] != 'ro-crate-metadata.json':
                    continue
                # '' (absolute path) or '..' as the directory would make
                # the member extract outside of the temporary directory
                if split_path[0] in ('', '..'):
                    continue
                temp_dir = tempfile.mkdtemp()
                try:
                    # Passing the TarInfo, rather than its name, avoids a
                    # lookup that scans the entire archive
                    tar.extract(ti, path=temp_dir)
                    return self._provenance_utils.get_rocrate_as_dict(rocrate_path=os.path.join(temp_dir,
                                                                                                ti.name))
                finally:
                    shutil.rmtree(temp_dir)
        # Fail here with a clear message instead of returning None
        raise CellMapsError('No ro-crate-metadata.json found in: ' +
                            str(rocrate))

    @staticmethod
    def add_subparser(subparsers):
        """
        Adds the command-line subparser for the TableFromROCrates tool

        :param subparsers: Object returned by ``add_subparsers()`` of an
                           :py:class:`argparse.ArgumentParser`
        :return: Parser created for this command
        """
        desc = """

        Version {version}

        {cmd} Creates meta data Table from one or more RO-Crates
        """.format(version=cellmaps_utils.__version__,
                   cmd=TableFromROCrates.COMMAND)

        parser = subparsers.add_parser(TableFromROCrates.COMMAND,
                                       help='Creates metadata table from RO-Crates',
                                       description=desc,
                                       formatter_class=constants.ArgParseFormatter)
        parser.add_argument('outdir',
                            help='Directory metadata tables will be created')
        parser.add_argument('--rocrates', nargs='+',
                            help='Path to RO-Crates used for table generation. ')
        parser.add_argument('--date',
                            default=date.today().strftime('%Y-%m-%d'),
                            help='Date to list in table for RO-Crates')
        parser.add_argument('--release', default='0.1 alpha',
                            help='Version of release')
        parser.add_argument('--downloadurlprefix',
                            default='https://g-9b3b6e.9ad93.a567.data.globus.org/Data/cm4ai_0.1alpha/',
                            help='URL prefix for downloads. MUST END WITH /')
        return parser