# Source code for mh_utils.cef_parser

#!/usr/bin/env python3
#
#  __init__.py
r"""
Parser for MassHunter Compound Exchange Format ``.cef`` files.

A CEF file represents the compounds identified in LC-MS data by MassHunter Qualitative.
It consists of a list of compounds encapsulated in a :class:`~.CompoundList`.

A :class:`~.CompoundList` consists of :class:`~.Compound` objects representing the
individual compounds identified in the data. Each :class:`~.Compound` object contains
information on the location of that compound within the LC data (:attr:`~.Compound.location`),
the scores indicating the confidence of the match (:attr:`~.Compound.compound_scores`),
a list of possible matching compounds (:attr:`~.Compound.results`),
and the matching mass spectrum extracted from the LC-MS data (:attr:`~.Compound.spectra`).

.. container:: structure-diagram

	The following diagram represents this structure:

	* :class:`CompoundList`

		+ :class:`Compound`

			- :attr:`Compound.algo` ⇨ :class:`str`
			- :attr:`Compound.location` ⇨ :py:obj:`~typing.Optional` [:class:`LocationDict`\]
			- :attr:`Compound.compound_scores` ⇨ :py:obj:`~typing.Optional` [:class:`~typing.Dict` [:class:`str`, :class:`~.Score`\]\]
			- :attr:`Compound.results` ⇨ :class:`~typing.List`

				- :class:`~.Molecule`
				- Another :class:`~.Molecule`
				- ``...``

			- :attr:`Compound.spectra` ⇨ :class:`~typing.List`

				- :class:`~.Spectrum`
				- Another :class:`~.Spectrum`
				- ``...``



		+ Another :class:`Compound`
		+ ``...``


.. clearpage::
"""
#
#  Copyright © 2020-2021 Dominic Davis-Foster <dominic@davis-foster.co.uk>
#
#  Permission is hereby granted, free of charge, to any person obtaining a copy
#  of this software and associated documentation files (the "Software"), to deal
#  in the Software without restriction, including without limitation the rights
#  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
#  copies of the Software, and to permit persons to whom the Software is
#  furnished to do so, subject to the following conditions:
#
#  The above copyright notice and this permission notice shall be included in all
#  copies or substantial portions of the Software.
#
#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
#  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
#  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
#  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
#  DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
#  OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
#  OR OTHER DEALINGS IN THE SOFTWARE.
#

# stdlib
import datetime
import re
from pprint import pformat
from typing import Dict, Iterable, List, Optional, Sequence, Type, Union

# 3rd party
import attr
import lxml.objectify  # type: ignore
from attr_utils.docstrings import add_attrs_doc
from attr_utils.serialise import serde
from chemistry_tools.formulae import Formula
from domdf_python_tools.bases import Dictable, NamedList
from domdf_python_tools.doctools import prettify_docstrings
from domdf_python_tools.pretty_print import FancyPrinter
from domdf_python_tools.stringlist import DelimitedList
from domdf_python_tools.typing import PathLike
from typing_extensions import TypedDict

# this package
from mh_utils.utils import make_timedelta

__all__ = [
		"Molecule",
		"Device",
		"Peak",
		"Spectrum",
		"make_timedelta",
		"RTRange",
		"Flag",
		"Score",
		"parse_compound_scores",
		"parse_match_scores",
		"LocationDict",
		"Compound",
		"CompoundList",
		"parse_cef",
		]


class Molecule(Dictable):
    """
    A single molecule entry from a CEF file.

    :param name: The name of the compound
    :param formula: The formula of the compound.
        A string is parsed with :meth:`chemistry_tools.formulae.Formula.from_string`;
        :py:obj:`None` gives an empty :class:`~chemistry_tools.formulae.Formula`.
    :param matches: Dictionary of algo: score match values.

    :raises TypeError: if ``matches`` is neither a dictionary nor :py:obj:`None`.
    """

    def __init__(
            self,
            name: str,
            formula: Union[str, Formula, None] = None,
            matches: Optional[Dict[str, "Score"]] = None,
            ):
        super().__init__()

        self.name = str(name)

        # Normalise ``formula`` to a Formula instance before storing it.
        if formula is None:
            formula = Formula()
        elif not isinstance(formula, Formula):
            formula = Formula.from_string(formula)
        self.formula = formula

        if matches is None:
            self.matches = {}
        elif isinstance(matches, dict):
            self.matches = matches
        else:
            raise TypeError(f"'matches' must be a dictionary, not {type(matches)}")

    @property
    def __dict__(self):
        return {
                "name": self.name,
                "formula": self.formula,
                "matches": self.matches,
                }

    @classmethod
    def from_xml(cls, element: lxml.objectify.ObjectifiedElement) -> "Molecule":
        """
        Build a :class:`~.Molecule` from its XML representation.

        :param element: a Molecule XML element
        """

        attributes = element.attrib
        name = attributes["name"]
        formula = attributes["formula"]
        matches = parse_match_scores(element.MatchScores)

        return cls(name=name, formula=formula, matches=matches)

    def __repr__(self) -> str:
        """
        Returns a string representation of the :class:`~mh_utils.cef_parser.Molecule`.
        """

        return f"<Molecule({self.name}, {self.formula!r})>"

    def __str__(self) -> str:
        """
        Returns the molecule as a string.
        """

        return f"Molecule({self.name}, {self.formula!s})"
@serde
@add_attrs_doc
@attr.s(slots=True)
class Device:
    """
    Describes the device that acquired a :class:`~.Spectrum`.
    """

    #: String identifying the type of device.
    device_type: str = attr.ib(converter=str)

    number: int = attr.ib(converter=int)

    @classmethod
    def from_xml(cls, element: lxml.objectify.ObjectifiedElement) -> "Device":
        """
        Build a :class:`~.Device` from its XML representation.

        The ``type`` and ``num`` attributes of the element supply
        ``device_type`` and ``number`` respectively.

        :param element: a ``<Device>`` XML element from a CEF file
        """

        attributes = element.attrib
        return cls(device_type=attributes["type"], number=attributes["num"])
@serde
@add_attrs_doc
@attr.s(slots=True)
class Peak:
    """
    A single peak of a Mass Spectrum.
    """

    x: float = attr.ib(converter=float)
    rx: float = attr.ib(converter=float)
    y: float = attr.ib(converter=float)

    #: The charge on the peak.
    charge: int = attr.ib(converter=int, default=0)

    #: The label of the peak. e.g. "M+H"
    label: str = attr.ib(converter=str, default='')

    @classmethod
    def from_xml(cls, element: lxml.objectify.ObjectifiedElement) -> "Peak":
        """
        Build a :class:`~.Peak` from its XML representation.

        The ``z`` and ``s`` XML attributes are passed on as ``charge`` and ``label``
        (defaulting to ``0`` and ``''`` when absent); every other attribute is
        forwarded to the constructor under its own name.

        :param element: a ``<p>`` XML element from an <MSPeaks> element of a CEF file
        """

        fields = dict(element.attrib)

        # Translate the terse XML attribute names into constructor keywords.
        fields["charge"] = fields.pop('z', 0)
        fields["label"] = fields.pop('s', '')

        return cls(**fields)
class Spectrum(Dictable):
    """
    Agilent CEF Spectrum.

    :param spectrum_type: The type of spectrum e.g. ``'FbF'``.
    :param algorithm: The algorithm used to identify the compound.
    :param saturation_limit: Unknown. Might mean saturation limit?
    :param scans: Unknown. Presumably the number of scans that make up the spectrum?
    :param scan_type:
    :param ionisation: The type of ionisation e.g. ESI.
    :param polarity: The polarity of the ionisation.
        ``'+'``, ``1``, ``'1'`` and ``'positive'`` (any case) give ``1``;
        ``'-'``, ``-1``, ``'-1'`` and ``'negative'`` (any case) give ``-1``;
        anything else is passed to :class:`int`.
    :param voltage: The voltage. If a string, the unsigned decimal number at the
        start of the string is used; a string that does not start with a number gives ``0.0``.
    :param device: The device that acquired the data.
    :param peaks: A list of identified peaks in the mass spectrum.
    :param rt_ranges: A list of retention time ranges for the mass spectrum.
    """

    __slots__ = [
            "spectrum_type",
            "saturation_limit",
            "scans",
            "algorithm",
            "scan_type",
            "ionisation",
            "voltage",
            "polarity",
            "device",
            "peaks",
            "rt_ranges",
            ]

    def __init__(
            self,
            spectrum_type: str = '',
            algorithm: str = '',
            saturation_limit: int = 0,
            scans: int = 0,
            scan_type: str = '',
            ionisation: str = '',
            polarity: Union[str, int] = 0,
            voltage: Union[str, float] = 0.0,
            device: Optional[Device] = None,
            peaks: Optional[Sequence[Peak]] = None,
            rt_ranges: Optional[Sequence["RTRange"]] = None,
            ):
        super().__init__()

        self.spectrum_type = str(spectrum_type)
        self.saturation_limit = int(saturation_limit)
        self.scans = int(scans)
        self.algorithm = str(algorithm)
        self.scan_type = str(scan_type)
        self.ionisation = str(ionisation)

        self.voltage: float
        if isinstance(voltage, str):
            # Keep only the leading number, dropping any trailing text.
            m = re.match(r"([0-9]+\.?[0-9]*)", voltage)
            if m is not None and m.group(1):
                self.voltage = float(m.group(1))
            else:
                # Always store a float so the attribute has a single type.
                self.voltage = 0.0
        else:
            self.voltage = float(voltage)

        self.polarity: int
        if polarity in {'+', 1, '1'}:
            self.polarity = 1
        elif polarity in {'-', -1, "-1"}:
            self.polarity = -1
        elif isinstance(polarity, str) and polarity.lower() == "positive":
            self.polarity = 1
        elif isinstance(polarity, str) and polarity.lower() == "negative":
            self.polarity = -1
        else:
            self.polarity = int(polarity)

        self.device = device

        # Copy the sequences so the spectrum owns its own lists.
        self.peaks = [] if peaks is None else list(peaks)
        self.rt_ranges = [] if rt_ranges is None else list(rt_ranges)

    @property
    def __dict__(self):
        return {key: getattr(self, key) for key in self.__slots__}

    @classmethod
    def from_xml(cls, element: lxml.objectify.ObjectifiedElement) -> "Spectrum":
        """
        Construct a :class:`~.Spectrum` object from an XML element.

        The ``satLimit``, ``scans`` and ``fv`` attributes and the ``<RTRanges>``
        child are optional; when absent the constructor defaults are used.

        :param element: a Spectrum XML element from a CEF file
        """

        attributes = element.attrib
        ms_details = element.MSDetails.attrib

        data = {}
        data["spectrum_type"] = attributes["type"]
        data["algorithm"] = attributes["cpdAlgo"]

        if "satLimit" in attributes:
            data["saturation_limit"] = attributes["satLimit"]
        if "scans" in attributes:
            data["scans"] = attributes["scans"]

        data["scan_type"] = ms_details["scanType"]
        data["ionisation"] = ms_details["is"]
        data["polarity"] = ms_details['p']

        if "fv" in ms_details:
            data["voltage"] = ms_details["fv"]

        data["device"] = Device.from_xml(element.Device)
        data["peaks"] = [Peak.from_xml(p) for p in element.MSPeaks.findall('p')]

        if element.findall("RTRanges"):
            data["rt_ranges"] = [RTRange.from_xml(r) for r in element.RTRanges.findall("RTRange")]

        # TODO: <MassCalibration>

        return cls(**data)

    def __repr__(self) -> str:
        """
        Returns a string representation of the :class:`~mh_utils.cef_parser.Spectrum`.
        """

        return f"<Spectrum({pformat(self.peaks)})>"
@serde
@add_attrs_doc
@attr.s(slots=True)
class RTRange:
    """
    A retention time range, as given by an ``<RTRange>`` element of a CEF file.
    """

    #: The start time in minutes
    start: datetime.timedelta = attr.ib(converter=make_timedelta, default=0.0)  # type: ignore

    #: The end time in minutes
    end: datetime.timedelta = attr.ib(converter=make_timedelta, default=0.0)  # type: ignore

    @classmethod
    def from_xml(cls, element: lxml.objectify.ObjectifiedElement) -> "RTRange":
        """
        Build an :class:`~.RTRange` from its XML representation.

        The ``min`` and ``max`` attributes of the element supply the start and end.

        :param element: The ``<RTRange>`` XML element to parse the data from.
        """

        attributes = element.attrib
        range_start = attributes["min"]
        range_end = attributes["max"]

        return cls(range_start, range_end)
# TODO: Subclass these from UserString and UserFloat
@prettify_docstrings
class Flag(str):
    """
    Represents a flag in a score, to warn that the identification of a compound is poor.

    :param string: The text of the flag
    :param severity: The severity of the flag
    """

    __slots__ = ("severity", )
    severity: int

    def __new__(cls: Type["Flag"], string: str, severity: int) -> "Flag":  # noqa: D102
        obj = super().__new__(cls, str(string))
        obj.severity = int(severity)
        return obj

    def __copy__(self):
        return Flag(str(self), self.severity)

    def __deepcopy__(self, memodict=None):
        # ``memodict`` is accepted for the copy protocol but is not needed here:
        # both components are immutable values.
        return Flag(str(self), int(self.severity))

    def __eq__(self, other) -> bool:
        """
        Two flags are equal when both their text and severity match.

        Comparison with anything else is delegated to :class:`str`.
        """

        if isinstance(other, Flag):
            return str(self) == str(other) and self.severity == other.severity
        else:
            return super().__eq__(other)

    def __ne__(self, other) -> bool:
        """
        The inverse of ``==``.
        """

        # Returning NotImplemented unconditionally made Python fall back to an
        # identity comparison for two flags, so equal flags compared as ``!=``.
        result = self.__eq__(other)
        if result is NotImplemented:
            return NotImplemented
        return not result

    def __repr__(self) -> str:
        """
        Returns a string representation of the :class:`~mh_utils.cef_parser.Flag`.
        """

        return f"{self.__class__.__name__}({str(self)!r}, severity={self.severity})"

    def __bool__(self) -> bool:
        """
        Returns a boolean representation of the :class:`~mh_utils.cef_parser.Flag`.

        A flag is truthy only if it has both non-empty text and a non-zero severity.
        """

        return bool(str(self)) and bool(self.severity)
@prettify_docstrings
class Score(float):
    """
    A score indicating how well the compound matches the observed spectrum.

    :param score: The score
    :param flag_string: Optional flag. See :class:`~.Flag` for details.
    :param flag_severity: The severity of the flag.
    """

    flag: Flag

    def __new__(cls, score, flag_string: str = '', flag_severity: int = 0) -> "Score":  # noqa: D102
        obj = super().__new__(cls, float(score))
        obj.flag = Flag(flag_string, flag_severity)
        return obj

    def __init__(self, score, flag_string: str = '', flag_severity: int = 0):
        # All state is set up in ``__new__``; this method only exists so the
        # extra constructor arguments are accepted.
        super().__init__()

    def __copy__(self):
        return Score(float(self), str(self.flag), self.flag.severity)

    def __deepcopy__(self, memodict=None):
        # ``memodict`` is accepted for the copy protocol but is not needed here:
        # all components are immutable values.
        return Score(float(self), str(self.flag), int(self.flag.severity))

    def __repr__(self) -> str:
        """
        Returns a string representation of the :class:`~mh_utils.cef_parser.Score`.

        The flag is only included when it is truthy.
        """

        if self.flag:
            return f"{self.__class__.__name__}({str(self)}, {self.flag!r})"
        else:
            return f"{self.__class__.__name__}({str(self)})"

    def __str__(self) -> str:
        """
        Returns the :class:`~mh_utils.cef_parser.Score` as a string.
        """

        return str(float(self))

    def __eq__(self, other) -> bool:
        """
        Two scores are equal when both their value and flag match.

        Comparison with anything else is delegated to :class:`float`.
        """

        if isinstance(other, Score):
            return float(self) == float(other) and self.flag == other.flag
        else:
            return super().__eq__(other)

    def __ne__(self, other) -> bool:
        """
        The inverse of ``==``.
        """

        # Returning NotImplemented unconditionally made Python fall back to an
        # identity comparison for two scores, so equal scores compared as ``!=``.
        result = self.__eq__(other)
        if result is NotImplemented:
            return NotImplemented
        return not result
def parse_compound_scores(element: lxml.objectify.ObjectifiedElement) -> Dict[str, Score]:
    """
    Build a mapping of algorithm name to :class:`~.Score` from a ``<CompoundScores>`` element.

    Each ``<CpdScore>`` child contributes one entry. The flag text and severity
    default to ``''`` and ``0`` when the corresponding attributes are absent.

    :param element: a CompoundScores XML element.
    """

    scores_by_algo: Dict[str, Score] = {}

    for cpd_score in element.findall("CpdScore"):
        attributes = cpd_score.attrib
        algorithm: str = attributes["algo"]

        scores_by_algo[algorithm] = Score(
                attributes["score"],
                attributes.get("tgtFlagsString", ''),
                attributes.get("tgtFlagsSeverity", 0),
                )

    return scores_by_algo
def parse_match_scores(element: lxml.objectify.ObjectifiedElement) -> Dict[str, Score]:
    """
    Build a mapping of algorithm name to :class:`~.Score` from a ``<MatchScores>`` element.

    Each ``<Match>`` child contributes one entry. The flag text and severity
    default to ``''`` and ``0`` when the corresponding attributes are absent.

    :param element: a MatchScores XML element.
    """

    scores_by_algo: Dict[str, Score] = {}

    for match in element.findall("Match"):
        attributes = match.attrib
        algorithm: str = attributes["algo"]

        scores_by_algo[algorithm] = Score(
                attributes["score"],
                attributes.get("tgtFlagsString", ''),
                attributes.get("tgtFlagsSeverity", 0),
                )

    return scores_by_algo
class LocationDict(TypedDict, total=False):
    """
    :class:`~typing.TypedDict` representing the location of a spectrum within mass spectrometry data.

    Declared with ``total=False``, so every key is optional.
    """

    m: float  #: the accurate mass of the compound, determined from the observed mass spectrum.
    rt: float  #: The retention time at which the compound was detected.
    a: float  #: The area of the peak in the EIC.
    y: float  #: The height of the peak in the EIC.
class _CompoundStrPPrinter(FancyPrinter):
    # Pretty printer that renders Molecule and Formula objects with ``str()``;
    # everything else is handled by FancyPrinter.

    def _repr(self, object, context, level):  # noqa: A002  # pylint: disable=redefined-builtin
        if not isinstance(object, (Molecule, Formula)):
            return super()._repr(object, context, level)

        self._readable = True
        self._recursive = False
        return str(object)
def _parse_int_or_float(value: str) -> Union[int, float]:
    # Parse ``value`` as an int where possible, otherwise as a float.
    try:
        return int(value)
    except ValueError:
        return float(value)


class Compound(Dictable):
    """
    Represents a compound identified in mass spectral data by MassHunter Qualitative.

    :param algo: The algorithm used to identify the compound.
    :param location: A dictionary of information to locate the compound in the spectral data.
    :param compound_scores: A dictionary of compound scores.
    :param results: A list of molecules that match the spectrum.
    :param spectra: A list of spectra for the compound.

    Any of ``location``, ``compound_scores``, ``results`` and ``spectra``
    that is omitted or empty is replaced with a new empty container.
    """

    algo: str  #: The algorithm used to identify the compound.
    location: LocationDict  #: A dictionary of information to locate the compound in the spectral data.
    compound_scores: Dict[str, "Score"]  #: A dictionary of compound scores.
    results: List[Molecule]  #: A list of molecules that match the spectrum.
    spectra: List[Spectrum]  #: A list of spectra for the compound.

    def __init__(
            self,
            algo: str = '',
            location: Optional[LocationDict] = None,
            compound_scores: Optional[Dict[str, "Score"]] = None,
            results: Optional[Sequence[Molecule]] = None,
            spectra: Optional[Sequence[Spectrum]] = None,
            ):
        super().__init__()

        self.algo = str(algo)

        if location:
            self.location = location
        else:
            self.location = {}

        if compound_scores:
            self.compound_scores = compound_scores
        else:
            self.compound_scores = {}

        if results:
            self.results = list(results)
        else:
            self.results = []

        if spectra:
            self.spectra = list(spectra)
        else:
            self.spectra = []

    @property
    def __dict__(self):
        return dict(
                algo=self.algo,
                location=self.location,
                compound_scores=self.compound_scores,
                results=self.results,
                spectra=self.spectra,
                )

    @classmethod
    def from_xml(cls, element: lxml.objectify.ObjectifiedElement) -> "Compound":
        """
        Construct a :class:`~.Compound` object from an XML element.

        Only the ``<Location>`` attributes present in the XML appear in
        :attr:`~.Compound.location`. The ``a`` and ``y`` values are stored as
        :class:`int` when written as whole numbers, and as :class:`float` otherwise.

        :param element: a Compound XML element from a CEF file.
        """

        location_attrib = element.Location.attrib
        location: LocationDict = {}

        if 'm' in location_attrib:
            location['m'] = float(location_attrib['m'])
        if "rt" in location_attrib:
            location["rt"] = float(location_attrib["rt"])
        # Plain ``int()`` raised ValueError for decimal values such as "1234.5",
        # although LocationDict declares these fields as floats.
        if 'a' in location_attrib:
            location['a'] = _parse_int_or_float(location_attrib['a'])
        if 'y' in location_attrib:
            location['y'] = _parse_int_or_float(location_attrib['y'])

        results: List[Molecule] = [
                Molecule.from_xml(molecule) for molecule in element.Results.findall("Molecule")
                ]
        spectra: List[Spectrum] = [Spectrum.from_xml(spectrum) for spectrum in element.findall("Spectrum")]

        return cls(
                algo=element.attrib["algo"],
                location=location,
                compound_scores=parse_compound_scores(element.CompoundScores),
                results=results,
                spectra=spectra,
                )

    def __repr__(self) -> str:
        """
        Returns a string representation of the :class:`~mh_utils.cef_parser.Compound`.
        """

        results_repr = FancyPrinter(indent=4, width=80, depth=None, compact=False).pformat(self.results)
        return f"<Compound({results_repr})>"

    def __str__(self) -> str:
        """
        Returns the :class:`~mh_utils.cef_parser.Compound` as a string.

        The results are written on one line when the estimated length is at most
        78 characters, and pretty-printed over several lines otherwise.
        """

        # Estimated width of the one-line form; 7 is the per-molecule
        # allowance for the "Molecule()" wrapper text.
        results_length = sum(
                7 + len(molecule.name) + len(str(molecule.formula)) for molecule in self.results
                )

        if results_length > 78:
            results_str = _CompoundStrPPrinter(indent=4, width=80, depth=None, compact=False).pformat(self.results)
        else:
            results_str = f"[{DelimitedList(self.results):, }]"

        return f"Compound({results_str})"
class CompoundList(NamedList):
    """
    A list of :class:`~.Compound` objects parsed from a CEF file.

    The full :class:`list` API is available for this class.

    :param instrument: String identifying the instrument that acquired the data.
    :param compounds: List of compounds identified in the mass spectrometry data.
    """

    #: The type of instrument that obtained the data, e.g. ``"LCQTOF"``.
    instrument: str

    def __init__(self, instrument: str = '', compounds: Optional[Iterable[Compound]] = None):
        super().__init__(compounds)
        self.instrument = str(instrument)

    @classmethod
    def from_xml(cls, element: lxml.objectify.ObjectifiedElement) -> "CompoundList":
        """
        Build a :class:`~.CompoundList` from its XML representation.

        The instrument is read from the ``instrumentConfiguration`` attribute,
        and one :class:`~.Compound` is created per ``<Compound>`` child.

        :param element: The XML element to parse the data from.
        """

        instrument = element.attrib["instrumentConfiguration"]
        parsed_compounds = (Compound.from_xml(node) for node in element.findall("Compound"))

        return cls(instrument=instrument, compounds=parsed_compounds)

    def __str__(self) -> str:
        """
        Returns the list as a string.
        """

        class_name = self.__class__.__name__
        return class_name + pformat(list(self))
def parse_cef(filename: PathLike) -> CompoundList:
    """
    Read the given ``.cef`` file and return its contents as a :class:`~.CompoundList`.

    :param filename: The filename of the CEF file to read.
    """

    root = lxml.objectify.parse(str(filename)).getroot()

    # The version is not used further, but looking it up raises KeyError
    # if the root element has no ``version`` attribute.
    version = root.attrib["version"]

    return CompoundList.from_xml(root.CompoundList)