Source code for mh_utils.csv_parser

#!/usr/bin/env python3
#
#  __init__.py
"""
Parser for CSV result files produced by MassHunter Qualitative.

.. versionadded:: 0.2.0

.. autosummary::

	~mh_utils.csv_parser.ResultParser
	~mh_utils.csv_parser.parse_masshunter_csv
"""
#
#  Copyright © 2020-2021 Dominic Davis-Foster <dominic@davis-foster.co.uk>
#
#  Permission is hereby granted, free of charge, to any person obtaining a copy
#  of this software and associated documentation files (the "Software"), to deal
#  in the Software without restriction, including without limitation the rights
#  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
#  copies of the Software, and to permit persons to whom the Software is
#  furnished to do so, subject to the following conditions:
#
#  The above copyright notice and this permission notice shall be included in all
#  copies or substantial portions of the Software.
#
#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
#  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
#  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
#  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
#  DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
#  OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
#  OR OTHER DEALINGS IN THE SOFTWARE.
#

# stdlib
from typing import Iterable

# 3rd party
import pandas
import sdjson
from domdf_python_tools.paths import PathPlus
from domdf_python_tools.typing import PathLike

# this package
from mh_utils.csv_parser.classes import Result, Sample, SampleList
from mh_utils.csv_parser.utils import drop_columns, reorder_columns

__all__ = ["ResultParser", "parse_masshunter_csv", "Result", "SampleList", "Sample"]


class ResultParser:
	"""
	Given a directory of CSV results exported from MassHunter, parse them to CSV and JSON.

	Both output directories are created when the parser is constructed,
	together with any missing parent directories.

	:param raw_results_dir: The directory in which the raw exports from MassHunter are stored.
	:param json_results_dir: The directory to store the output json files in.
	:param csv_results_dir: The directory to store the output csv files in.
	"""

	def __init__(self, raw_results_dir: PathLike, json_results_dir: PathLike, csv_results_dir: PathLike):

		self.raw_results_dir = PathPlus(raw_results_dir)

		self.json_results_dir = PathPlus(json_results_dir)
		self.json_results_dir.maybe_make(parents=True)

		self.csv_results_dir = PathPlus(csv_results_dir)
		self.csv_results_dir.maybe_make(parents=True)

	def parse_for_directory(self, directory: PathLike) -> None:
		"""
		Convert the "CSV Results.csv" file in the given directory to CSV and JSON.

		The input is read from ``<raw_results_dir>/<directory>/CSV Results.csv``.
		The outputs are written to ``<csv_results_dir>/<directory>/CSV Results Parsed.csv``
		and ``<json_results_dir>/<directory>/results.json``.
		The source and destination paths are printed before parsing starts.

		:param directory: The directory to process, given relative to ``raw_results_dir``.
			The same relative path is used beneath the two output directories.
		"""

		json_dir = self.json_results_dir / directory
		csv_dir = self.csv_results_dir / directory

		# ``directory`` may have several components (e.g. ``batch/sample``),
		# so create missing intermediate directories, as ``__init__`` does.
		json_dir.maybe_make(parents=True)
		csv_dir.maybe_make(parents=True)

		infile = self.raw_results_dir / directory / "CSV Results.csv"
		csv_outfile = csv_dir / "CSV Results Parsed.csv"
		json_outfile = json_dir / "results.json"

		# The second line is padded to the width of ``infile`` so the two arrows line up.
		print(f"{infile} -> {csv_outfile}")
		print(f"{' ' * len(str(infile))} -> {json_outfile}")

		parse_masshunter_csv(infile, csv_outfile, json_outfile)

	def parse_directory_list(self, directory_list: Iterable[PathLike]) -> None:
		"""
		Runs :meth:`~.ResultParser.parse_for_directory` for each directory in ``directory_list``.

		The name of each directory is printed before it is processed.

		:param directory_list: A list of directories to process.
		"""

		for directory in directory_list:
			print(f"Processing directory {directory}")
			self.parse_for_directory(directory)


def parse_masshunter_csv(csv_file: PathLike, csv_outfile: PathLike, json_outfile: PathLike) -> None:
	"""
	Parse CSV results files created by MassHunter.

	The input file is read with every value kept as a string, then passed through
	``drop_columns`` and ``reorder_columns``. The reordered table is written to
	``csv_outfile``. Each row of that table is then added to a ``SampleList``,
	which is finally dumped as JSON to ``json_outfile``.

	:param csv_file: The CSV file exported from MassHunter to read.
	:param csv_outfile: The file to write the reordered CSV table to.
	:param json_outfile: The file to write the JSON representation of the samples to.
	"""

	# ``header=1`` takes the column names from the second line of the file.
	raw_df = pandas.read_csv(csv_file, header=1, index_col=False, dtype=str)

	# Discard the columns which aren't needed.
	# NOTE(review): the return value is unused, so this presumably modifies ``raw_df`` in place.
	drop_columns(raw_df)

	ordered_df = reorder_columns(raw_df)
	ordered_df.to_csv(csv_outfile, index=False)

	sample_list = SampleList()

	for _, row in ordered_df.iterrows():
		# Register the row with the sample list first, then attach the row's result
		# to whatever sample object that call returns.
		sample_list.add_sample_from_series(row).add_result(Result.from_series(row))

	# TODO: https://github.com/python/mypy/issues/5018
	# If it ever gets fixed
	PathPlus(json_outfile).dump_json(
			sample_list,
			json_library=sdjson,  # type: ignore[arg-type]
			indent=2,
			)