# Source code for mh_utils.csv_parser.classes

#!/usr/bin/env python3
#
#  classes.py
"""
Classes to model parts of MassHunter CSV files.

.. versionadded:: 0.2.0
"""
#
#  Copyright © 2020-2021 Dominic Davis-Foster <dominic@davis-foster.co.uk>
#
#  Permission is hereby granted, free of charge, to any person obtaining a copy
#  of this software and associated documentation files (the "Software"), to deal
#  in the Software without restriction, including without limitation the rights
#  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
#  copies of the Software, and to permit persons to whom the Software is
#  furnished to do so, subject to the following conditions:
#
#  The above copyright notice and this permission notice shall be included in all
#  copies or substantial portions of the Software.
#
#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
#  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
#  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
#  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
#  DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
#  OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
#  OR OTHER DEALINGS IN THE SOFTWARE.
#

# stdlib
from collections import OrderedDict
from decimal import Decimal
from typing import Dict, Iterable, List, Optional, Tuple, Type, TypeVar

# 3rd party
import numpy
import pandas  # type: ignore
import sdjson
from cawdrey import AlphaDict
from domdf_python_tools import doctools
from domdf_python_tools.bases import Dictable
from domdf_python_tools.doctools import prettify_docstrings
from domdf_python_tools.paths import PathPlus
from domdf_python_tools.typing import PathLike

# Names exported by ``from mh_utils.csv_parser.classes import *``.
# The underscore-prefixed type variables (``_S``, ``_SL``, ``_R``) are listed
# alongside the public classes and encoder functions.
__all__ = [
		"Sample",
		"Result",
		"SampleList",
		"BaseSamplePropertyDict",
		"SamplesAreaDict",
		"SamplesScoresDict",
		"encode_result_or_sample",
		"encode_set",
		"encode_decimal",
		"_S",
		"_SL",
		"_R",
		]

# Report :class:`pandas.Series` as belonging to the top-level ``pandas`` package.
# NOTE(review): presumably this is so documentation cross-references resolve
# to ``pandas.Series`` -- confirm. The attribute must be ``__module__``;
# ``__module_`` (one trailing underscore) would only create an unused attribute.
pandas.Series.__module__ = "pandas"

# Type variables bound to the classes in this module. They annotate ``cls`` / ``self``
# in methods which return an instance of the calling class, so subclasses are typed correctly.
_S = TypeVar("_S", bound="Sample")
_SL = TypeVar("_SL", bound="SampleList")
_R = TypeVar("_R", bound="Result")


@prettify_docstrings
class Sample(Dictable):
	"""
	Represents a sample in a MassHunter CSV file.

	:param sample_name:
	:param sample_type:
	:param instrument_name:
	:param position:
	:param user:
	:param acq_method:
	:param da_method:
	:param irm_cal_status:
	:param filename:
	:param results: The results for the sample. May be :py:obj:`None` (no results),
		a :class:`dict` mapping compound numbers to results, or a :class:`list` of results.
		In either container each result may be a :class:`~.Result` object
		or a dictionary of keyword arguments for :class:`~.Result`.

	:raises TypeError: If ``results`` is not :py:obj:`None`, a :class:`dict` or a :class:`list`.
	"""

	def __init__(
			self,
			sample_name,
			sample_type,
			instrument_name,
			position,
			user,
			acq_method,
			da_method,
			irm_cal_status,
			filename,
			results=None,
			):
		super().__init__()

		self.sample_name = sample_name
		self.sample_type = sample_type
		self.instrument_name = instrument_name
		self.position = position
		self.user = user
		self.acq_method = acq_method
		self.da_method = da_method
		self.irm_cal_status = irm_cal_status
		self.filename = filename

		# Results keyed by their compound number (the ``index`` of the Result).
		self._results: Dict[float, Result] = {}

		if isinstance(results, dict):
			for cpd_no, compound in results.items():
				# Plain dictionaries are expanded into Result objects.
				self._results[cpd_no] = Result(**compound) if isinstance(compound, dict) else compound

		elif isinstance(results, list):
			for compound in results:
				result = Result(**compound) if isinstance(compound, dict) else compound
				# A list carries no keys, so the result's own index is used.
				self._results[result.index] = result

		elif results is not None:
			raise TypeError(f"Unknown type for `results`: {type(results)}")

	def add_result(self, result):
		"""
		Add a result to the sample.

		An existing result with the same ``index`` is replaced.

		:param result:
		"""

		self._results[result.index] = result

	@property
	def results_list(self) -> List["Result"]:
		"""
		Returns a list of results in the order in which they were identified.

		I.e. sorted by the ``Cpd`` value from the csv export.

		:rtype:

		.. clearpage::
		"""

		return [self._results[key] for key in sorted(self._results)]

	def __eq__(self, other) -> bool:
		# Defer to the other operand (and ultimately to identity comparison)
		# rather than implicitly returning ``None`` for unrelated types.
		if not isinstance(other, self.__class__):
			return NotImplemented

		return (
				self.sample_name == other.sample_name and self.sample_type == other.sample_type
				and self.filename == other.filename and self.acq_method == other.acq_method
				)

	@classmethod
	def from_series(cls: Type[_S], series) -> _S:
		"""
		Construct a :class:`~.Sample` from a :class:`pandas.Series`.

		The series must provide the ``Sample Name``, ``Sample Type``, ``File``,
		``Instrument Name``, ``Position``, ``User Name``, ``Acq Method``, ``DA Method``
		and ``IRM Calibration status`` entries. The new sample has no results.

		:param series:

		:return:
		"""

		sample_name = series["Sample Name"]
		sample_type = series["Sample Type"]
		filename = series["File"]
		instrument_name = series["Instrument Name"]
		position = series["Position"]
		user = series["User Name"]
		acq_method = series["Acq Method"]
		da_method = series["DA Method"]
		irm_cal_status = series["IRM Calibration status"]

		return cls(
				sample_name,
				sample_type,
				instrument_name,
				position,
				user,
				acq_method,
				da_method,
				irm_cal_status,
				filename,
				)

	def __repr__(self):
		return f"Sample({self.sample_name})"

	@property
	def __dict__(self):
		# The keys match the keyword arguments of ``__init__``.
		return AlphaDict(
				sample_name=self.sample_name,
				sample_type=self.sample_type,
				instrument_name=self.instrument_name,
				position=self.position,
				user=self.user,
				acq_method=self.acq_method,
				da_method=self.da_method,
				irm_cal_status=self.irm_cal_status,
				filename=self.filename,
				results=self.results_list
				)
@prettify_docstrings
class Result(Dictable):
	r"""
	Represents a Result in a MassHunter CSV file.

	Numeric arguments are coerced on construction: ``abundance``, ``height`` and ``area``
	to :class:`float`; the ``z`` / ion counts and ``flag_severity_code`` to :class:`int`;
	the remaining numeric values to :class:`decimal.Decimal`.

	.. raw:: latex

		\begin{multicols}{2}

	:param cas:
	:param name:
	:param hits:
	:param index:
	:param formula:
	:param score:
	:param abundance:
	:param height:
	:param area:
	:param diff_mDa:
	:param diff_ppm:
	:param rt:
	:param start:
	:param end:
	:param width:
	:param tgt_rt:
	:param rt_diff:
	:param mz:
	:param product_mz:
	:param base_peak:
	:param mass:
	:param average_mass:
	:param tgt_mass:
	:param mining_algorithm:
	:param z_count:
	:param max_z:
	:param min_z:
	:param n_ions:
	:param polarity:
	:param label:
	:param flags:
	:param flag_severity:
	:param flag_severity_code:

	.. raw:: latex

		\end{multicols}
	"""

	def __init__(
			self,
			cas,
			name: str,
			hits,
			index: int = -1,
			formula: str = '',
			score: float = 0.0,
			abundance: float = 0,
			height: float = 0,
			area: float = 0,
			diff_mDa: float = 0.0,
			diff_ppm: float = 0.0,
			rt: float = 0.0,
			start: float = 0.0,
			end: float = 0.0,
			width: float = 0.0,
			tgt_rt: float = 0.0,
			rt_diff: float = 0.0,
			mz: float = 0.0,
			product_mz: float = 0.0,
			base_peak: float = 0.0,
			mass: float = 0.0,
			average_mass: float = 0.0,
			tgt_mass: float = 0.0,
			mining_algorithm: str = '',
			z_count: int = 0,
			max_z: int = 0,
			min_z: int = 0,
			n_ions: int = 0,
			polarity: str = '',
			label: str = '',
			flags: str = '',
			flag_severity: str = '',
			flag_severity_code: int = 0,
			):
		super().__init__()

		# Possible also AL (ID Source) and AM (ID Techniques Applied)
		self._cas = cas
		self.name: str = str(name)
		self.hits = hits
		self.formula: str = str(formula)
		self.score: Decimal = Decimal(score)

		# These are floats; converting with ``int`` would silently drop the fractional part.
		self.abundance: float = float(abundance)
		self.height: float = float(height)
		self.area: float = float(area)

		self.diff_mDa: Decimal = Decimal(diff_mDa)
		self.diff_ppm: Decimal = Decimal(diff_ppm)
		self.rt: Decimal = Decimal(rt)
		self.start: Decimal = Decimal(start)
		self.end: Decimal = Decimal(end)
		self.width: Decimal = Decimal(width)
		self.tgt_rt: Decimal = Decimal(tgt_rt)
		self.rt_diff: Decimal = Decimal(rt_diff)
		self.mz: Decimal = Decimal(mz)
		self.product_mz: Decimal = Decimal(product_mz)
		self.base_peak: Decimal = Decimal(base_peak)
		self.mass: Decimal = Decimal(mass)
		self.average_mass: Decimal = Decimal(average_mass)
		self.tgt_mass: Decimal = Decimal(tgt_mass)
		self.mining_algorithm: str = str(mining_algorithm)
		self.z_count: int = int(z_count)
		self.max_z: int = int(max_z)
		self.min_z: int = int(min_z)
		self.n_ions: int = int(n_ions)
		self.polarity: str = str(polarity)
		self.label: str = str(label)
		self.flags: str = str(flags)
		self.flag_severity: str = str(flag_severity)
		self.flag_severity_code: int = int(flag_severity_code)
		self.index: int = index  # Tracks the number of the result in the sample

		# "Score (Tgt)",

	@classmethod
	def from_series(cls: Type[_R], series: pandas.Series) -> _R:
		"""
		Construct a :class:`~.classes.Result` from a :class:`pandas.Series`.

		Each argument of the constructor is read from the correspondingly named
		entry of the series (e.g. ``Cpd`` for ``index``, ``Abund`` for ``abundance``).

		:param series:

		:rtype:

		.. clearpage::
		"""

		cas = series["CAS"]
		name = series["Name"]
		index = series["Cpd"]
		hits = series["Hits"]
		formula = series["Formula"]
		score = series["Score"]
		abundance = series["Abund"]
		height = series["Height"]
		area = series["Area"]
		diff_mDa = series["Diff (Tgt, mDa)"]
		diff_ppm = series["Diff (Tgt, ppm)"]
		rt = series["RT"]
		start = series["Start"]
		end = series["End"]
		width = series["Width"]
		tgt_rt = series["RT (Tgt)"]
		rt_diff = series["RT Diff (Tgt)"]
		mz = series["m/z"]
		product_mz = series["m/z (prod.)"]
		base_peak = series["Base Peak"]
		mass = series["Mass"]
		average_mass = series["Avg Mass"]
		tgt_mass = series["Mass (Tgt)"]
		mining_algorithm = series["Mining Algorithm"]
		z_count = series["Z Count"]
		max_z = series["Max Z"]
		min_z = series["Min Z"]
		n_ions = series["Ions"]
		polarity = series["Polarity"]
		label = series["Label"]
		flags = series["Flags (Tgt)"]
		flag_severity = series["Flag Severity (Tgt)"]
		flag_severity_code = series["Flag Severity Code (Tgt)"]

		return cls(
				cas,
				name,
				hits,
				index,
				formula,
				score,
				abundance,
				height,
				area,
				diff_mDa,
				diff_ppm,
				rt,
				start,
				end,
				width,
				tgt_rt,
				rt_diff,
				mz,
				product_mz,
				base_peak,
				mass,
				average_mass,
				tgt_mass,
				mining_algorithm,
				z_count,
				max_z,
				min_z,
				n_ions,
				polarity,
				label,
				flags,
				flag_severity,
				flag_severity_code,
				)

	def __repr__(self):
		return f"Result({self.name}; {self.formula}; {self.rt}; {self.score})"

	@property
	def __dict__(self):
		# The keys match the keyword arguments of ``__init__``.
		return AlphaDict(
				cas=self._cas,
				name=self.name,
				hits=self.hits,
				formula=self.formula,
				score=self.score,
				abundance=self.abundance,
				height=self.height,
				area=self.area,
				diff_mDa=self.diff_mDa,
				diff_ppm=self.diff_ppm,
				rt=self.rt,
				start=self.start,
				end=self.end,
				width=self.width,
				tgt_rt=self.tgt_rt,
				rt_diff=self.rt_diff,
				mz=self.mz,
				product_mz=self.product_mz,
				base_peak=self.base_peak,
				mass=self.mass,
				average_mass=self.average_mass,
				tgt_mass=self.tgt_mass,
				mining_algorithm=self.mining_algorithm,
				z_count=self.z_count,
				max_z=self.max_z,
				min_z=self.min_z,
				n_ions=self.n_ions,
				polarity=self.polarity,
				label=self.label,
				flags=self.flags,
				flag_severity=self.flag_severity,
				flag_severity_code=self.flag_severity_code,
				index=self.index,
				)

	def __eq__(self, other):
		# A result compares equal to a string holding its name, ignoring case.
		if isinstance(other, str):
			return other.casefold() == self.name.casefold()
		else:
			return NotImplemented
class SampleList(List[Sample]):
	"""
	A list of :class:`mh_utils.csv_parser.classes.Sample` objects.
	"""

	@doctools.append_docstring_from(Sample.__init__)
	def add_new_sample(self, *args, **kwargs):
		"""
		Add a new sample to the list and return the :class:`~classes.Sample` object representing it.
		"""  # noqa: D400

		tmp_sample = Sample(*args, **kwargs)
		return self.add_sample(tmp_sample)

	def add_sample(self, sample: Sample) -> Sample:
		"""
		Add a :class:`~.Sample` object to the list.

		If an equal sample is already in the list, that sample is returned
		and the list is left unchanged.

		:param sample:

		:rtype:

		.. clearpage::
		"""

		try:
			return self[self.index(sample)]
		except ValueError:
			# Not in the list yet.
			self.append(sample)
			return sample

	# def find_sample(self, sample_name: str) -> Optional[Sample]:
	# 	if sample_name in self:
	# 		return self[self.index(sample_name)]
	# 	else:
	# 		return None

	def add_sample_from_series(self, series: pandas.Series) -> Sample:
		"""
		Create a new sample object from a :class:`pandas.Series` and add it to the list.

		:param series:

		:returns: The newly created :class:`~classes.Sample` object,
			or the equal sample already in the list.
		"""

		tmp_sample = Sample.from_series(series)
		return self.add_sample(tmp_sample)

	def sort_samples(self, key: str, reverse: bool = False):
		"""
		Sort the list of :class:`~.Samples` in place.

		:param key: The name of the property in the sample to sort by.
		:param reverse: Whether the list should be sorted in reverse order.

		:rtype:

		.. clearpage::
		"""

		self.sort(key=lambda samp: getattr(samp, key), reverse=reverse)

	def reorder_samples(self, order_mapping: Dict, key: str = "sample_name"):
		"""
		Reorder the list of :class:`~.Samples` in place.

		:param order_mapping: A mapping between sample names and their new position in the list.
			Every sample in the list must have an entry. For example:

			.. code-block:: python

				order_mapping = {
					"Propellant 1ug +ve": 0,
					"Propellant 1mg +ve": 1,
					"Propellant 1ug -ve": 2,
					"Propellant 1mg -ve": 3,
					}

		:param key: The name of the property in the sample to sort by.

		:raises KeyError: If a sample's ``key`` value is missing from ``order_mapping``.
		"""

		# Ascending sort: position 0 comes first, as described by the mapping.
		self.sort(key=lambda s: order_mapping[getattr(s, key)])

	def rename_samples(self, rename_mapping: Dict, key: str = "sample_name"):
		"""
		Rename the samples in the list.

		:param rename_mapping: A mapping between current sample names and their new names.
			The mapping is not modified.
		:param key: The name of the property in the sample to look up in ``rename_mapping``.

		Map a sample to :py:obj:`None`, or omit the sample from the ``rename_mapping`` entirely,
		to leave the name unchanged.

		For example:

		.. code-block:: python

			rename_mapping = {
				"Propellant 1ug +ve": "Alliant Unique 1µg/L +ESI",
				"Propellant 1mg +ve": "Alliant Unique 1mg/L +ESI",
				"Propellant 1mg -ve": None,
				}
		"""

		for sample in self:
			# Look the name up without removing it, so that the caller's mapping is
			# preserved and several samples sharing a name are all renamed.
			new_name = rename_mapping.get(getattr(sample, key))
			if new_name:
				sample.sample_name = new_name

	def get_areas_and_scores(
			self,
			compound_name: str,
			include_none: bool = False,
			) -> Tuple[OrderedDict, OrderedDict]:
		"""
		Returns two dictionaries: one containing sample names and peak areas for the compound with the given name,
		the other containing sample names and scores.

		Only the first matching result in each sample is used.

		:param compound_name:
		:param include_none: Whether samples where the compound was not found
			should be included in the results.
		"""  # noqa: D400

		peak_areas: "OrderedDict[str, Optional[float]]" = OrderedDict()
		scores: "OrderedDict[str, Optional[Decimal]]" = OrderedDict()

		for sample in self:
			for result in sample.results_list:
				if result.name == compound_name:
					peak_areas[sample.sample_name] = result.area
					scores[sample.sample_name] = result.score
					break
			else:
				# No result matched in this sample.
				if include_none:
					peak_areas[sample.sample_name] = None
					scores[sample.sample_name] = None

		return peak_areas, scores

	def get_retention_times(self, compound_name: str, include_none: bool = False) -> OrderedDict:
		"""
		Returns a dictionary containing sample names and retention times for the compound with the given name.

		:param compound_name:
		:param include_none: Whether samples where the compound was not found
			should be included in the results. Such samples are given ``numpy.nan``.
		"""  # noqa: D400

		times = OrderedDict()

		for sample in self:
			for result in sample.results_list:
				if result.name == compound_name:
					times[sample.sample_name] = float(result.rt)
					break
			else:
				# No result matched in this sample.
				if include_none:
					times[sample.sample_name] = numpy.nan

		return times

	def get_peak_areas(self, compound_name: str, include_none: bool = False) -> OrderedDict:
		"""
		Returns a dictionary containing sample names and peak areas for the compound with the given name.

		:param compound_name:
		:param include_none: Whether samples where the compound was not found
			should be included in the results.
		"""  # noqa: D400

		return self.get_areas_and_scores(compound_name, include_none)[0]

	def get_areas_for_compounds(
			self,
			compound_names: Iterable[str],
			include_none: bool = False,
			) -> "SamplesAreaDict":
		"""
		Returns a dictionary containing sample names and peak areas for the compounds with the given names.

		:param compound_names:
		:param include_none: Whether samples where none of the specified compounds were found
			should be included in the results.
		"""  # noqa: D400

		all_areas, all_scores = self.get_areas_and_scores_for_compounds(compound_names, include_none)
		return all_areas

	def get_areas_and_scores_for_compounds(
			self,
			compound_names: Iterable[str],
			include_none: bool = False,
			) -> Tuple["SamplesAreaDict", "SamplesScoresDict"]:
		"""
		Returns two dictionaries: one containing sample names and peak areas for the compounds with the given names,
		the other containing sample names and scores.

		:param compound_names:
		:param include_none: Whether samples where none of the specified compounds were found
			should be included in the results.

		:rtype:

		.. clearpage::
		"""  # noqa: D400

		tmp_all_areas = SamplesAreaDict()
		tmp_all_scores = SamplesScoresDict()

		for name in compound_names:
			# A single pass yields both the areas and the scores.
			areas, scores = self.get_areas_and_scores(name, True)

			for sample_name, area in areas.items():
				if sample_name not in tmp_all_areas:
					tmp_all_areas[sample_name] = dict()
					tmp_all_scores[sample_name] = dict()

				tmp_all_areas[sample_name][name] = area
				tmp_all_scores[sample_name][name] = scores[sample_name]

		if include_none:
			return tmp_all_areas, tmp_all_scores

		all_areas = SamplesAreaDict()
		all_scores = SamplesScoresDict()

		for sample_name, compound_areas in tmp_all_areas.items():
			# Keep the sample only if at least one compound has a truthy area.
			if any(compound_areas.values()):
				all_areas[sample_name] = compound_areas
				all_scores[sample_name] = tmp_all_scores[sample_name]

		return all_areas, all_scores

	def get_compounds(self) -> List[str]:
		"""
		Returns a list containing the names of the compounds present in the samples in alphabetical order.
		"""

		compounds = {result.name for sample in self for result in sample.results_list}
		return sorted(compounds)

	def get_scores(self, compound_name: str, include_none: bool = False) -> OrderedDict:
		"""
		Returns a dictionary containing sample names and scores for the compound with the given name.

		:param compound_name:
		:param include_none: Whether samples where the compound was not found
			should be included in the results.

		:rtype:

		.. clearpage::
		"""  # noqa: D400

		return self.get_areas_and_scores(compound_name, include_none)[1]

	def filter(  # noqa: A003  # pylint: disable=redefined-builtin
			self: _SL,
			sample_names: Iterable[str],
			key: str = "sample_name",
			exclude: bool = False,
			) -> _SL:
		"""
		Filter the list to only contain sample_names whose name is in ``sample_names``.

		:param sample_names: A list of sample names to include
		:param key: The name of the property in the sample to compare against ``sample_names``.
		:param exclude: If :py:obj:`True`, any sample whose name is in ``sample_names``
			will be excluded from the output, rather than included.

		:returns: A new list of the same class; the original list is not modified.
		"""

		# One-shot iterables (e.g. generators) would be exhausted by the first
		# membership test, so materialise anything which is not already a container.
		names = sample_names if hasattr(sample_names, "__contains__") else list(sample_names)

		new_sample_list = self.__class__()

		for sample in self:
			matched = getattr(sample, key) in names
			if matched != bool(exclude):
				new_sample_list.append(sample)

		return new_sample_list

	@property
	def sample_names(self) -> List[str]:
		"""
		Returns a list of sample names in the :class:`~.classes.SampleList`.
		"""

		return [sample.sample_name for sample in self]

	@classmethod
	def from_json_file(cls: Type[_SL], filename: PathLike, **kwargs) -> _SL:
		r"""
		Construct a :class:`~.classes.SampleList` from JSON file.

		The file is expected to contain a sequence of dictionaries,
		each holding the keyword arguments for a :class:`~.classes.Sample`.

		:param filename: The filename of the JSON file.
		:param \*\*kwargs: Keyword arguments passed to :meth:`domdf_python_tools.paths.PathPlus.load_json`.
		"""

		all_samples = cls()

		for sample in PathPlus(filename).load_json(
				json_library=sdjson,  # type: ignore
				**kwargs,
				):
			all_samples.append(Sample(**sample))

		return all_samples
class BaseSamplePropertyDict(OrderedDict):
	"""
	OrderedDict to store a single property of a set of samples.

	Keys are the sample names and the values are dictionaries mapping compound names to property values.
	"""

	@property
	def sample_names(self) -> List[str]:
		"""
		Returns a list of sample names in the :class:`~.BaseSamplePropertyDict`.
		"""

		return list(self)

	@property
	def n_samples(self) -> int:
		"""
		Returns the number of samples in the :class:`~.BaseSamplePropertyDict`.
		"""

		return len(self)

	@property
	def n_compounds(self) -> int:
		"""
		Returns the number of compounds in the :class:`~.BaseSamplePropertyDict`.

		This is the number of entries for the first sample, or ``0`` if there are no samples.
		"""

		if not self:
			return 0

		first_sample_values = next(iter(self.values()))
		return len(first_sample_values)
class SamplesAreaDict(BaseSamplePropertyDict):
	"""
	:class:`collections.OrderedDict` to store area information parsed from MassHunter results CSV files.
	"""

	def get_compound_areas(self, compound_name: str) -> List[float]:
		"""
		Get the peak areas for the given compound in every sample.

		Samples with no entry for the compound are skipped;
		an entry of :py:obj:`None` is reported as ``0.0``.

		:param compound_name:
		"""

		return [
				0.0 if area is None else area
				for compound_areas in self.values()
				for name, area in compound_areas.items()
				if compound_name == name
				]
class SamplesScoresDict(BaseSamplePropertyDict):
	"""
	:class:`collections.OrderedDict` to store score information parsed from MassHunter results CSV files.
	"""

	def get_compound_scores(self, compound_name: str) -> List[float]:
		"""
		Get the peak scores for the given compound in every sample.

		Samples with no entry for the compound are skipped;
		an entry of :py:obj:`None` is reported as ``0.0``.

		:param compound_name:
		"""

		return [
				0.0 if score is None else score
				for compound_scores in self.values()
				for name, score in compound_scores.items()
				if compound_name == name
				]
@sdjson.encoders.register(Sample)
@sdjson.encoders.register(Result)
def encode_result_or_sample(obj):
	"""
	``sdjson`` encoder for :class:`~.Sample` and :class:`~.Result`: converts the object to a :class:`dict`.
	"""

	as_dict = dict(obj)
	return as_dict


@sdjson.encoders.register(set)
def encode_set(obj):
	"""
	``sdjson`` encoder for :class:`set`: converts the set to a :class:`list`.
	"""

	as_list = list(obj)
	return as_list


@sdjson.encoders.register(Decimal)
def encode_decimal(obj):
	"""
	``sdjson`` encoder for :class:`decimal.Decimal`: converts the value to a :class:`str`.
	"""

	as_string = str(obj)
	return as_string