#!/usr/bin/env python3
#
# __init__.py
r"""
Parser for MassHunter Compound Exchange Format ``.cef`` files.
A CEF file represents the compounds identified in LC-MS data by MassHunter Qualitative.
It consists of a list of compounds encapsulated in a :class:`~.CompoundList`.
A :class:`~.CompoundList` consists of :class:`~.Compound` objects representing the
individual compounds identified in the data. Each :class:`~.Compound` object contains
information on the location of that compound within the LC data (:attr:`~.Compound.location`),
the scores indicating the confidence of the match (:attr:`~.Compound.compound_scores`),
a list of possible matching compounds (:attr:`~.Compound.results`),
and the matching mass spectrum extracted from the LC-MS data (:attr:`~.Compound.spectra`).
.. container:: structure-diagram
The following diagram represents this structure:
* :class:`CompoundList`
+ :class:`Compound`
- :attr:`Compound.algo` ⇨ :class:`str`
- :attr:`Compound.location` ⇨ :py:obj:`~typing.Optional` [:class:`LocationDict`\]
- :attr:`Compound.compound_scores` ⇨ :py:obj:`~typing.Optional` [:class:`~typing.Dict` [:class:`str`, :class:`~.Score`\]\]
- :attr:`Compound.results` ⇨ :class:`~typing.List`
- :class:`~.Molecule`
- Another :class:`~.Molecule`
- ``...``
- :attr:`Compound.spectra` ⇨ :class:`~typing.List`
- :class:`~.Spectrum`
- Another :class:`~.Spectrum`
- ``...``
+ Another :class:`Compound`
+ ``...``
.. clearpage::
"""
#
# Copyright © 2020-2021 Dominic Davis-Foster <dominic@davis-foster.co.uk>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
# OR OTHER DEALINGS IN THE SOFTWARE.
#
# stdlib
import datetime
import re
from pprint import pformat
from typing import Dict, Iterable, List, Optional, Sequence, Type, Union
# 3rd party
import attr
import lxml.objectify # type: ignore
from attr_utils.docstrings import add_attrs_doc
from attr_utils.serialise import serde
from chemistry_tools.formulae import Formula
from domdf_python_tools.bases import Dictable, NamedList
from domdf_python_tools.doctools import prettify_docstrings
from domdf_python_tools.pretty_print import FancyPrinter
from domdf_python_tools.stringlist import DelimitedList
from domdf_python_tools.typing import PathLike
from typing_extensions import TypedDict
# this package
from mh_utils.utils import make_timedelta
# Public API of this module: the names exported by a star-import.
# ``make_timedelta`` is imported from ``mh_utils.utils`` above and re-exported here.
__all__ = [
"Molecule",
"Device",
"Peak",
"Spectrum",
"make_timedelta",
"RTRange",
"Flag",
"Score",
"parse_compound_scores",
"parse_match_scores",
"LocationDict",
"Compound",
"CompoundList",
"parse_cef",
]
class Molecule(Dictable):
    """
    A candidate molecule listed in a CEF file.

    :param name: The name of the compound. Stored as a :class:`str`.
    :param formula: The formula of the compound.
        Either a :class:`chemistry_tools.formulae.Formula`, a string parsable by
        :meth:`Formula.from_string`, or :py:obj:`None` for an empty formula.
    :param matches: Dictionary mapping algorithm names to match scores.

    :raises TypeError: If ``matches`` is neither a dictionary nor :py:obj:`None`.
    """

    def __init__(
            self,
            name: str,
            formula: Union[str, Formula, None] = None,
            matches: Optional[Dict[str, "Score"]] = None,
            ):
        super().__init__()

        self.name = str(name)

        # Normalise the formula to a Formula instance whatever form it arrived in.
        if formula is None:
            self.formula = Formula()
        elif isinstance(formula, Formula):
            self.formula = formula
        else:
            self.formula = Formula.from_string(formula)

        # Only a real dict (or nothing at all) is accepted for the match scores.
        if matches is None:
            self.matches = {}
        elif isinstance(matches, dict):
            self.matches = matches
        else:
            raise TypeError(f"'matches' must be a dictionary, not {type(matches)}")

    @property
    def __dict__(self):
        # Mapping of the three public attributes.
        return {
            "name": self.name,
            "formula": self.formula,
            "matches": self.matches,
        }

    @classmethod
    def from_xml(cls, element: lxml.objectify.ObjectifiedElement) -> "Molecule":
        """
        Build a :class:`~.Molecule` from a Molecule XML element.

        The ``name`` and ``formula`` attributes and the ``MatchScores`` child are read.

        :param element: a Molecule XML element
        """

        attributes = element.attrib
        molecule_name = attributes["name"]
        molecule_formula = attributes["formula"]
        match_scores = parse_match_scores(element.MatchScores)

        return cls(name=molecule_name, formula=molecule_formula, matches=match_scores)

    def __repr__(self) -> str:
        """
        Returns a string representation of the :class:`~mh_utils.cef_parser.Molecule`.
        """

        return f"<Molecule({self.name}, {self.formula!r})>"

    def __str__(self) -> str:
        """
        Returns the molecule as a string.
        """

        return f"Molecule({self.name}, {self.formula!s})"
@serde
@add_attrs_doc
@attr.s(slots=True)
class Device:
    """
    The device that acquired a :class:`~.Spectrum`.
    """

    #: String identifying the type of device.
    device_type: str = attr.ib(converter=str)

    #: The device number, converted with :class:`int`.
    number: int = attr.ib(converter=int)

    @classmethod
    def from_xml(cls, element: lxml.objectify.ObjectifiedElement) -> "Device":
        """
        Build a :class:`~.Device` from an XML element.

        The ``type`` and ``num`` attributes of the element are used.

        :param element: a ``<Device>`` XML element from a CEF file
        """

        attributes = element.attrib
        return cls(device_type=attributes["type"], number=attributes["num"])
@serde
@add_attrs_doc
@attr.s(slots=True)
class Peak:
    """
    A single peak in a Mass Spectrum.
    """

    x: float = attr.ib(converter=float)
    rx: float = attr.ib(converter=float)
    y: float = attr.ib(converter=float)

    #: The charge on the peak.
    charge: int = attr.ib(converter=int, default=0)

    #: The label of the peak. e.g. "M+H"
    label: str = attr.ib(converter=str, default='')

    @classmethod
    def from_xml(cls, element: lxml.objectify.ObjectifiedElement) -> "Peak":
        """
        Build a :class:`~.Peak` from an XML element.

        The element's attributes become the constructor's keyword arguments,
        with ``z`` renamed to ``charge`` (default ``0``) and ``s`` renamed to ``label`` (default ``''``).

        :param element: a ``<p>`` XML element from an <MSPeaks> element of a CEF file
        """

        fields = dict(element.attrib)
        # Rename the terse XML attribute names to the constructor's argument names.
        fields.update(charge=fields.pop('z', 0), label=fields.pop('s', ''))
        return cls(**fields)
class Spectrum(Dictable):
    """
    Agilent CEF Spectrum.

    :param spectrum_type: The type of spectrum e.g. ``'FbF'``.
    :param algorithm: The algorithm used to identify the compound.
    :param saturation_limit: Unknown. Might mean saturation limit?
    :param scans: Unknown. Presumably the number of scans that make up the spectrum?
    :param scan_type:
    :param ionisation: The type of ionisation e.g. ESI.
    :param polarity: The polarity of the ionisation.
        ``'+'``, ``1``, ``'1'`` and ``'positive'`` (any case) are stored as ``1``;
        ``'-'``, ``-1``, ``'-1'`` and ``'negative'`` (any case) are stored as ``-1``.
        Anything else is passed through :class:`int`.
    :param voltage: The voltage, read from the ``fv`` attribute of ``<MSDetails>`` by :meth:`from_xml`.
        A string is searched for a leading unsigned number (e.g. ``'380V'`` gives ``380.0``);
        a string without one gives ``0.0``. Any other value is passed through :class:`float`.
    :param device: The device that acquired the data.
    :param peaks: A list of identified peaks in the mass spectrum.
    :param rt_ranges: A list of retention time ranges for the mass spectrum.
    """

    __slots__ = [
        "spectrum_type",
        "saturation_limit",
        "scans",
        "algorithm",
        "scan_type",
        "ionisation",
        "voltage",
        "polarity",
        "device",
        "peaks",
        "rt_ranges",
    ]

    def __init__(
            self,
            spectrum_type: str = '',
            algorithm: str = '',
            saturation_limit: int = 0,
            scans: int = 0,
            scan_type: str = '',
            ionisation: str = '',
            polarity: Union[str, int] = 0,
            voltage: Union[str, float] = 0.0,
            device: Optional[Device] = None,
            peaks: Optional[Sequence[Peak]] = None,
            rt_ranges: Optional[Sequence["RTRange"]] = None,
            ):
        super().__init__()

        self.spectrum_type = str(spectrum_type)
        self.saturation_limit = int(saturation_limit)
        self.scans = int(scans)
        self.algorithm = str(algorithm)
        self.scan_type = str(scan_type)
        self.ionisation = str(ionisation)

        self.voltage: float
        if isinstance(voltage, str):
            # Take the leading number and ignore any trailing unit, e.g. "380V".
            m = re.match(r"([0-9]+\.?[0-9]*)", voltage)
            if m is not None and m.group(1):
                self.voltage = float(m.group(1))
            else:
                # Always store a float so the attribute has one consistent type.
                self.voltage = 0.0
        else:
            self.voltage = float(voltage)

        self.polarity: int
        if polarity in {'+', 1, '1'}:
            self.polarity = 1
        elif polarity in {'-', -1, "-1"}:
            self.polarity = -1
        elif isinstance(polarity, str) and polarity.lower() == "positive":
            self.polarity = 1
        elif isinstance(polarity, str) and polarity.lower() == "negative":
            self.polarity = -1
        else:
            self.polarity = int(polarity)

        self.device = device

        # Copy the sequences so later changes to the caller's objects don't leak in.
        self.peaks = [] if peaks is None else list(peaks)
        self.rt_ranges = [] if rt_ranges is None else list(rt_ranges)

    @property
    def __dict__(self):
        # Mapping of every slot name to its current value.
        return {key: getattr(self, key) for key in self.__slots__}

    @classmethod
    def from_xml(cls, element: lxml.objectify.ObjectifiedElement) -> "Spectrum":
        """
        Construct a :class:`~.Spectrum` object from an XML element.

        The ``satLimit`` and ``scans`` attributes, the ``fv`` attribute of ``<MSDetails>``
        and the ``<RTRanges>`` child are optional; the constructor defaults apply when they are absent.

        :param element: a Spectrum XML element from a CEF file
        """

        data = {}

        data["spectrum_type"] = element.attrib["type"]
        data["algorithm"] = element.attrib["cpdAlgo"]

        if "satLimit" in element.attrib:
            data["saturation_limit"] = element.attrib["satLimit"]
        if "scans" in element.attrib:
            data["scans"] = element.attrib["scans"]

        ms_details = element.MSDetails.attrib
        data["scan_type"] = ms_details["scanType"]
        data["ionisation"] = ms_details["is"]
        data["polarity"] = ms_details['p']
        if "fv" in ms_details:
            data["voltage"] = ms_details["fv"]

        data["device"] = Device.from_xml(element.Device)
        data["peaks"] = [Peak.from_xml(p) for p in element.MSPeaks.findall('p')]

        if element.findall("RTRanges"):
            data["rt_ranges"] = [RTRange.from_xml(r) for r in element.RTRanges.findall("RTRange")]

        # TODO: <MassCalibration>

        return cls(**data)

    def __repr__(self) -> str:
        """
        Returns a string representation of the :class:`~mh_utils.cef_parser.Spectrum`.
        """

        return f"<Spectrum({pformat(self.peaks)})>"
@serde
@add_attrs_doc
@attr.s(slots=True)
class RTRange:
    """
    An ``<RTRange>`` element from a CEF file.

    Both values are passed through :func:`~mh_utils.utils.make_timedelta`.
    """

    #: The start time in minutes
    start: datetime.timedelta = attr.ib(converter=make_timedelta, default=0.0)  # type: ignore

    #: The end time in minutes
    end: datetime.timedelta = attr.ib(converter=make_timedelta, default=0.0)  # type: ignore

    @classmethod
    def from_xml(cls, element: lxml.objectify.ObjectifiedElement) -> "RTRange":
        """
        Build an :class:`~.RTRange` from an XML element.

        The ``min`` attribute becomes the start and the ``max`` attribute becomes the end.

        :param element: The ``<RTRange>`` XML element to parse the data from.
        """

        attributes = element.attrib
        return cls(attributes["min"], attributes["max"])
# TODO: Subclass these from UserString and UserFloat
@prettify_docstrings
class Flag(str):
    """
    Represents a flag in a score, to warn that the identification of a compound is poor.

    A :class:`~.Flag` is a string with an additional integer ``severity`` attribute.
    Two flags are equal only if both the text and the severity match;
    comparison with any other object falls back to plain string comparison.

    :param string: The text of the flag
    :param severity: The severity of the flag
    """

    __slots__ = ("severity", )
    severity: int

    def __new__(cls: Type["Flag"], string: str, severity: int) -> "Flag":  # noqa: D102
        obj = super().__new__(cls, str(string))
        obj.severity = int(severity)
        return obj

    def __copy__(self):
        return Flag(str(self), self.severity)

    def __deepcopy__(self, memodict=None):
        # ``memodict`` is unused: both components are immutable, so there is nothing to memoise.
        return Flag(str(self), int(self.severity))

    def __eq__(self, other) -> bool:
        """
        Return whether ``other`` is equal to this flag.

        Against another :class:`~.Flag` both text and severity are compared;
        otherwise the comparison is delegated to :class:`str`.
        """

        if isinstance(other, Flag):
            return str(self) == str(other) and self.severity == other.severity
        else:
            return super().__eq__(other)

    def __ne__(self, other) -> bool:
        """
        Return whether ``other`` differs from this flag; the inverse of :meth:`__eq__`.
        """

        # Returning NotImplemented unconditionally makes Python fall back to an identity
        # check, so two distinct-but-equal flags would compare as both ``==`` and ``!=``.
        equal = self.__eq__(other)
        if equal is NotImplemented:
            return NotImplemented
        return not equal

    def __repr__(self) -> str:
        """
        Returns a string representation of the :class:`~mh_utils.cef_parser.Flag`.
        """

        return f"{self.__class__.__name__}({str(self)!r}, severity={self.severity})"

    def __bool__(self) -> bool:
        """
        Returns a boolean representation of the :class:`~mh_utils.cef_parser.Flag`.

        The flag is truthy only when it has both non-empty text and a non-zero severity.
        """

        return bool(str(self)) and bool(self.severity)
@prettify_docstrings
class Score(float):
    """
    A score indicating how well the compound matches the observed spectrum.

    A :class:`~.Score` is a float with an attached :class:`~.Flag`.
    Two scores are equal only if both the value and the flag match;
    comparison with any other object falls back to plain float comparison.

    :param score: The score
    :param flag_string: Optional flag. See :class:`~.Flag` for details.
    :param flag_severity: The severity of the flag.
    """

    flag: Flag

    def __new__(cls, score, flag_string: str = '', flag_severity: int = 0) -> "Score":  # noqa: D102
        obj = super().__new__(cls, float(score))
        obj.flag = Flag(flag_string, flag_severity)
        return obj

    def __init__(self, score, flag_string: str = '', flag_severity: int = 0):
        # All the work happens in ``__new__``; this only exists to expose the signature.
        super().__init__()

    def __copy__(self):
        return Score(float(self), str(self.flag), self.flag.severity)

    def __deepcopy__(self, memodict=None):
        # ``memodict`` is unused: every component is immutable, so there is nothing to memoise.
        return Score(float(self), str(self.flag), int(self.flag.severity))

    def __repr__(self) -> str:
        """
        Returns a string representation of the :class:`~mh_utils.cef_parser.Score`.

        The flag is only included when it is truthy.
        """

        if self.flag:
            return f"{self.__class__.__name__}({str(self)}, {self.flag!r})"
        else:
            return f"{self.__class__.__name__}({str(self)})"

    def __str__(self) -> str:
        """
        Returns the :class:`~mh_utils.cef_parser.Score` as a string.
        """

        return str(float(self))

    def __eq__(self, other) -> bool:
        """
        Return whether ``other`` is equal to this score.

        Against another :class:`~.Score` both value and flag are compared;
        otherwise the comparison is delegated to :class:`float`.
        """

        if isinstance(other, Score):
            return float(self) == float(other) and self.flag == other.flag
        else:
            return super().__eq__(other)

    def __ne__(self, other) -> bool:
        """
        Return whether ``other`` differs from this score; the inverse of :meth:`__eq__`.
        """

        # Returning NotImplemented unconditionally makes Python fall back to an identity
        # check, so two distinct-but-equal scores would compare as both ``==`` and ``!=``.
        equal = self.__eq__(other)
        if equal is NotImplemented:
            return NotImplemented
        return not equal
def parse_compound_scores(element: lxml.objectify.ObjectifiedElement) -> Dict[str, Score]:
    """
    Parse a ``<CompoundScores>`` element into a mapping of algorithms to scores.

    Each ``<CpdScore>`` child contributes one entry, keyed by its ``algo`` attribute.
    The flag text and severity come from the optional ``tgtFlagsString`` and
    ``tgtFlagsSeverity`` attributes, defaulting to ``''`` and ``0``.

    :param element: a CompoundScores XML element.
    """

    scores_by_algo: Dict[str, Score] = {}

    for cpd_score in element.findall("CpdScore"):
        attributes = cpd_score.attrib
        algorithm: str = attributes["algo"]
        scores_by_algo[algorithm] = Score(
            attributes["score"],
            attributes.get("tgtFlagsString", ''),
            attributes.get("tgtFlagsSeverity", 0),
        )

    return scores_by_algo
def parse_match_scores(element: lxml.objectify.ObjectifiedElement) -> Dict[str, Score]:
    """
    Parse a ``<MatchScores>`` element into a mapping of algorithms to scores.

    Each ``<Match>`` child contributes one entry, keyed by its ``algo`` attribute.
    The flag text and severity come from the optional ``tgtFlagsString`` and
    ``tgtFlagsSeverity`` attributes, defaulting to ``''`` and ``0``.

    :param element: a MatchScores XML element.
    """

    scores_by_algo: Dict[str, Score] = {}

    for match in element.findall("Match"):
        attributes = match.attrib
        algorithm: str = attributes["algo"]
        scores_by_algo[algorithm] = Score(
            attributes["score"],
            attributes.get("tgtFlagsString", ''),
            attributes.get("tgtFlagsSeverity", 0),
        )

    return scores_by_algo
class LocationDict(TypedDict, total=False):
    """
    :class:`~typing.TypedDict` representing the location of a spectrum within mass spectrometry data.

    Declared with ``total=False``, so every key is optional.
    """

    # NOTE(review): ``a`` and ``y`` are annotated as float, but ``Compound.from_xml``
    # stores them via ``int()`` -- confirm which type is intended.
    m: float  #: The accurate mass of the compound, determined from the observed mass spectrum.
    rt: float  #: The retention time at which the compound was detected.
    a: float  #: The area of the peak in the EIC.
    y: float  #: The height of the peak in the EIC.
class _CompoundStrPPrinter(FancyPrinter):
    """
    Pretty printer which renders :class:`~.Molecule` and ``Formula`` objects with :class:`str`.

    All other objects are handled by the parent printer.
    """

    def _repr(self, object, context, level):  # noqa: A002  # pylint: disable=redefined-builtin
        if not isinstance(object, (Molecule, Formula)):
            return super()._repr(object, context, level)

        # Molecules and formulae are printed via str(); set the printer's state flags to match.
        self._readable = True
        self._recursive = False
        return str(object)
class Compound(Dictable):
    """
    A compound identified in mass spectral data by MassHunter Qualitative.

    Any argument that is omitted or falsy is replaced by a fresh empty container.

    :param algo: The algorithm used to identify the compound.
    :param location: A dictionary of information to locate the compound in the spectral data.
    :param compound_scores: A dictionary of compound scores.
    :param results: A list of molecules that match the spectrum.
    :param spectra: A list of spectra for the compound.
    """

    #: The algorithm used to identify the compound.
    algo: str

    #: A dictionary of information to locate the compound in the spectral data.
    location: LocationDict

    #: A dictionary of compound scores.
    compound_scores: Dict[str, "Score"]

    #: A list of molecules that match the spectrum.
    results: List[Molecule]

    #: A list of spectra for the compound.
    spectra: List[Spectrum]

    def __init__(
            self,
            algo: str = '',
            location: Optional[LocationDict] = None,
            compound_scores: Optional[Dict[str, "Score"]] = None,
            results: Optional[Sequence[Molecule]] = None,
            spectra: Optional[Sequence[Spectrum]] = None,
            ):
        super().__init__()

        self.algo = str(algo)

        # The mappings are stored as given; the sequences are copied into new lists.
        self.location = location if location else {}
        self.compound_scores = compound_scores if compound_scores else {}
        self.results = list(results) if results else []
        self.spectra = list(spectra) if spectra else []

    @property
    def __dict__(self):
        # Mapping of the five public attributes.
        return {
            "algo": self.algo,
            "location": self.location,
            "compound_scores": self.compound_scores,
            "results": self.results,
            "spectra": self.spectra,
        }

    @classmethod
    def from_xml(cls, element: lxml.objectify.ObjectifiedElement) -> "Compound":
        """
        Build a :class:`~.Compound` from an XML element.

        The ``<Location>``, ``<Results>`` and ``<CompoundScores>`` children are read,
        along with every ``<Spectrum>`` child. Each of the ``m``, ``rt``, ``a`` and ``y``
        attributes of ``<Location>`` is only included in the location if present.

        :param element: a Compound XML element from a CEF file.
        """

        location_attrs = element.Location.attrib
        location: LocationDict = {}

        # Mass and retention time are parsed as floats; area and height as integers.
        if 'm' in location_attrs:
            location['m'] = float(location_attrs['m'])
        if "rt" in location_attrs:
            location["rt"] = float(location_attrs["rt"])
        if 'a' in location_attrs:
            location['a'] = int(location_attrs['a'])
        if 'y' in location_attrs:
            location['y'] = int(location_attrs['y'])

        results: List[Molecule] = [Molecule.from_xml(node) for node in element.Results.findall("Molecule")]
        spectra: List[Spectrum] = [Spectrum.from_xml(node) for node in element.findall("Spectrum")]

        return cls(
            algo=element.attrib["algo"],
            location=location,
            compound_scores=parse_compound_scores(element.CompoundScores),
            results=results,
            spectra=spectra,
        )

    def __repr__(self) -> str:
        """
        Returns a string representation of the :class:`~mh_utils.cef_parser.Compound`.
        """

        printer = FancyPrinter(indent=4, width=80, depth=None, compact=False)
        return f"<Compound({printer.pformat(self.results)})>"

    def __str__(self) -> str:
        """
        Returns the :class:`~mh_utils.cef_parser.Compound` as a string.

        The results are shown on a single line when their estimated length is at most 78,
        and pretty-printed otherwise.
        """

        # Estimated width: the name and formula of each molecule plus a fixed allowance of 7.
        estimated_width = sum(
            7 + len(molecule.name) + len(str(molecule.formula)) for molecule in self.results
        )

        if estimated_width > 78:
            printer = _CompoundStrPPrinter(indent=4, width=80, depth=None, compact=False)
            results_str = printer.pformat(self.results)
        else:
            results_str = f"[{DelimitedList(self.results):, }]"

        return f"Compound({results_str})"
class CompoundList(NamedList):
    """
    A list of :class:`~.Compound` objects parsed from a CEF file.

    The full :class:`list` API is available for this class.

    :param instrument: String identifying the instrument that acquired the data.
    :param compounds: List of compounds identified in the mass spectrometry data.
    """

    #: The type of instrument that obtained the data, e.g. ``"LCQTOF"``.
    instrument: str

    def __init__(self, instrument: str = '', compounds: Optional[Iterable[Compound]] = None):
        super().__init__(compounds)
        self.instrument = str(instrument)

    @classmethod
    def from_xml(cls, element: lxml.objectify.ObjectifiedElement) -> "CompoundList":
        """
        Build a :class:`~.CompoundList` from an XML element.

        The instrument is taken from the ``instrumentConfiguration`` attribute,
        and one :class:`~.Compound` is created for each ``<Compound>`` child.

        :param element: The XML element to parse the data from.
        """

        instrument = element.attrib["instrumentConfiguration"]
        # Left as a generator so the compounds are parsed as the list is populated.
        compounds = (Compound.from_xml(node) for node in element.findall("Compound"))

        return cls(instrument=instrument, compounds=compounds)

    def __str__(self) -> str:
        """
        Returns the list as a string.
        """

        class_name = self.__class__.__name__
        contents = pformat(list(self))
        return f"{class_name}{contents}"
def parse_cef(filename: PathLike) -> CompoundList:
    """
    Build a :class:`~.CompoundList` from the given ``.cef`` file.

    The file is parsed with :mod:`lxml.objectify` and the ``<CompoundList>`` child
    of the root element is converted.

    :param filename: The filename of the CEF file to read.
    """

    root = lxml.objectify.parse(str(filename)).getroot()

    # The version is read but not otherwise used; the lookup is kept so that
    # behaviour is unchanged for files which lack the attribute.
    version = root.attrib["version"]

    return CompoundList.from_xml(root.CompoundList)