Source code for mdacli.save

#!/usr/bin/env python3
# -*- Mode: python; tab-width: 4; indent-tabs-mode:nil; coding:utf-8 -*-

# Copyright (c) 2021 Authors and contributors
#
# Released under the GNU Public Licence, v2 or any higher version
# SPDX-License-Identifier: GPL-2.0-or-later
"""Manage data saving."""
import json
import os
import sys
import zipfile
from functools import partial
from pathlib import Path

import numpy as np
from MDAnalysis.analysis.base import Results



[docs]
def save(results, fprefix="mdacli_results"):
    """
    Save the attributes of a results instance to disk.

    1D, 2D and 3D numpy arrays are saved to csv files. 1D arrays of the
    same length are vertically stacked to create a table. 2D arrays are
    saved directly. 3D arrays are split into 2D arrays along the
    shortest dimension and one CSV is saved for each 2D array created,
    and resulting CSVs are stored together in a ZIP file. Note: higher
    dimensional arrays are ignored.

    We try to save everything else in a JSON file. Non-serializable
    types are ignored.

    Parameters
    ----------
    fprefix : str
        prefix for all files saved

    results : `~MDAnalysis.analysis.base.Results`
        A Results instance from which the stored data is taken.
    """
    save_1D_arrays(results, fprefix=fprefix, remove=True)
    save_2D_arrays(results, fprefix=fprefix, remove=True)
    save_3D_arrays(results, fprefix=fprefix, remove=True)
    save_higher_dim_arrays(results, fprefix=fprefix, remove=True, min_ndim=4)
    save_json_serializables(results, remove=True, fname=fprefix)
    save_Results_object(results, fprefix=fprefix, remove=True)
    return




[docs]
def save_1D_arrays(results, fprefix="1darray", remove=True):
    """
    Save 1D arrays from results.

    Parameters
    ----------
    results : dict-like
        Dictionary containing results.

    remove : bool
        If true remove keys mapping to 1D numpy arrays.
    """
    list_1D, list_1D_labels = get_1D_arrays(results)

    if not list_1D:
        return

    out_lists, out_lables = stack_1d_arrays_list(list_1D, list_1D_labels)

    for out_list, out_label in zip(out_lists, out_lables):
        out_label = out_label.flatten()

        # [3:] to align lables with entries
        savetxt_w_command(
            fname=f"{fprefix}_{'_'.join(out_label)}.csv",
            X=out_list.T,
            header=''.join([f"{i:>25}" for i in out_label])[3:]
            )

    return return_with_remove(results, list_1D_labels, remove)




[docs]
def get_1D_arrays(results):
    """Get items from dict which correspond to np.ndarrays one dim."""
    list_1D = []
    list_1D_labels = []

    for key, value in results.items():
        if is_1d_array(value):
            list_1D.append(value)
            list_1D_labels.append(key)

    return list_1D, list_1D_labels




[docs]
def stack_1d_arrays_list(list_1D, extra_list=None):
    """Stack a list of 1D numpy arrays of the same length vertically together.

    The result is a list containing 2D arrays where each array got the same
    number of rows.

    Parameters
    ----------
    list_1d : list
        list of 1 dimensional numpy arrays

    extra_list : list
        additional list of numpy arrays on which the
        operations are executed as for ``list_1d``

    Returns
    -------
    out_list : list
        list of stacked 2D numpy arrays organized by their length
    out_extra : list
        list of stacked 2D numpy applied applied to the same operations
        as out_list
    """
    # Sort for lengths
    lengths = np.array([len(a) for a in list_1D])
    sorted_idx = np.argsort(lengths)

    # Sort lists according to the lengths of the items
    list_1D_sorted = [list_1D[i] for i in sorted_idx]

    # Count the number of items for each length
    counts = np.unique(lengths, return_counts=True)[1]
    new_length_idx = np.hstack([[0], np.cumsum(counts)])

    out_lists = []
    # Concentanate lists of the same lenngth
    for i in range(0, len(new_length_idx) - 1):
        out_lists.append(np.vstack(list_1D_sorted[new_length_idx[i]:
                                                  new_length_idx[i + 1]]))

    if extra_list is not None:
        extra_list_sorted = [extra_list[i] for i in sorted_idx]
        out_extra = []
        for i in range(0, len(new_length_idx) - 1):
            out_extra.append(np.vstack(extra_list_sorted[new_length_idx[i]:
                                                         new_length_idx[i + 1]]
                                       ))

        return out_lists, out_extra
    else:
        return out_lists




[docs]
def save_2D_arrays(results, fprefix="2Darr", remove=True):
    """Save items of 2D array."""
    keys = []
    for key, value in results.items():
        value = try_to_squeeze_me(value)
        if is_2d_array(value):
            savetxt_w_command(fname=f"{fprefix}_{key}.csv", X=value)
            keys.append(key)

    return return_with_remove(results, keys, remove)




[docs]
def save_3D_arrays(results, fprefix="3Darr", remove=True):
    """Save items of 2D array."""
    keys = []
    for key, value in results.items():
        value = try_to_squeeze_me(value)
        if is_3d_array(value):
            save_3D_array_to_2D_csv(
                value,
                arr_name=f"{fprefix}_{key}",
                zipit=True,
                )
            keys.append(key)

    return return_with_remove(results, keys, remove)




[docs]
def save_3D_array_to_2D_csv(
        item,
        arr_name='arr',
        zipit=True,
        ):
    """
    Save 3D array to 2D CSVs.

    Has option to store all in a ZIP file.
    """
    min_dim = np.argmin(item)
    files_to_zip = []

    # Split array along the dimension with smallest number
    # of entries
    splitted_item = np.split(
        item,
        item.shape[min_dim],
        axis=min_dim,
        )

    save_to_folder = not zipit
    folder = None
    if save_to_folder:
        folder = Path(arr_name)
        folder.mkdir(parents=True, exist_ok=True)

    for i, arr in enumerate(splitted_item):
        fname = f"{arr_name}_dim_{min_dim}_idx_{i}.csv"
        foutname = folder.joinpath(fname) if folder else fname
        savetxt_w_command(fname=foutname, X=np.squeeze(arr))
        files_to_zip.append(fname)

    if zipit:
        save_files_to_zip(files_to_zip, zipname=arr_name, remove=True)

    return




[docs]
def save_higher_dim_arrays(results, fprefix="XDarr", remove=True, min_ndim=4):
    """Save items of multidimensional arrays to CSV."""
    keys = []
    for key, value in results.items():
        value = try_to_squeeze_me(value)
        if is_higher_dimension_array(value, min_ndim):
            save_result_array(value, fprefix=fprefix, arr_name=key)
            keys.append(key)

    return return_with_remove(results, keys, remove)




[docs]
def save_result_array(arr, fprefix='prefix'):
    """Save array to disk accoring to num of dimensions."""
    item = np.squeeze(arr)

    save_options = {
        item.ndim == 1: save_1D_arrays,
        item.ndim == 2: save_2D_arrays,
        item.ndim == 3: save_3D_arrays,
        item.ndim > 3: save_higher_dim_arrays,
        }

    save_options[True](item, fprefix=fprefix)
    return




[docs]
def save_json_serializables(results, remove=True, **jsonargs):
    """Save serializable items to a JSON."""
    json_dict = {
        key: value
        for key, value in results.items()
        if is_serializable(value)
        }

    if remove:
        for key in json_dict.keys():
            results.pop(key)

    if json_dict:
        json_dict["command"] = get_cli_input()
        save_to_json(json_dict, **jsonargs)




[docs]
def is_serializable(value):
    """Assert if value is json serializable."""
    try:
        json.dumps(value)
        return True
    except (TypeError, OverflowError):
        return False




[docs]
def save_to_json(json_dict, fname='jdict', indent=4, sort_keys=True):
    """Save dictionary to JSON file."""
    with open(f'{fname}.json', 'w') as f:
        json.dump(json_dict, f, indent=indent, sort_keys=sort_keys)




[docs]
def save_Results_object(results, fprefix='results', remove=True):
    """Save results if they are Results objects."""
    keys = []
    for key, value in results.items():
        if isinstance(value, Results):
            save(f"{fprefix}_{key}", value)
            keys.append(key)

    return return_with_remove(results, keys, remove)




[docs]
def return_with_remove(ddict, keys, remove):
    """
    Serve all saving functions.

    If remove is true,
    Returns subset of keys from dict.
    Removes keys subset from original dict.

    Else, return None.
    """
    if remove:
        return {key: ddict.pop(key) for key in keys}

    else:
        return None




[docs]
def save_files_to_zip(files, zipname='thezip', remove=True):
    """
    Compress all files into a single zip archive.

    Parameters
    ----------
    files : list-like
        File names to save to the ZIP archive.

    zipname : str
        The name of the zip file without extension.

    remove : bool, option, default True
        Removes the original files.
    """
    with zipfile.ZipFile(f'{zipname}.zip', 'w') as zipF:
        for file_name in files:
            zipF.write(
                file_name,
                compress_type=zipfile.ZIP_DEFLATED,
                )

    if remove:
        remove_files(files)

    return




[docs]
def is_dimension_array(arr, ndim):
    """Assert value is array and of certain dimension."""
    valid = \
        isinstance(arr, np.ndarray) \
        and arr.ndim == ndim

    return valid




[docs]
def is_higher_dimension_array(arr, ndim):
    """Assert value is array and of certain dimension."""
    valid = \
        isinstance(arr, np.ndarray) \
        and arr.ndim > ndim

    return valid



is_1d_array = partial(is_dimension_array, ndim=1)
is_2d_array = partial(is_dimension_array, ndim=2)
is_3d_array = partial(is_dimension_array, ndim=3)



[docs]
def try_to_squeeze_me(arr):
    """Squeeze the arr if is array."""
    return np.squeeze(arr) if isinstance(arr, np.ndarray) else arr




[docs]
def remove_files(files):
    """Remove files from disk."""
    for filename in files:
        os.remove(filename)
    return




[docs]
def savetxt_w_command(fname, X, header='', fsuffix=".csv", **kwargs):
    """
    Save CSV data with info about execution command.

    Adds the command line input to the header and checks for a doubled
    defined filesuffix.
    """
    header = "{}\n{}".format(get_cli_input(), header)
    fname = "{}{}".format(fname, (not fname.endswith(fsuffix)) * fsuffix)
    np.savetxt(
        fname,
        X,
        header=header,
        fmt="%-20s",
        **kwargs)




[docs]
def get_cli_input():
    """Return a proper fomatted string of the command line input."""
    program_name = os.path.basename(sys.argv[0])
    # Add additional quotes for connected arguments.
    arguments = [
        '"{}"'.format(arg).strip()
        if " " in arg else arg
        for arg in sys.argv[1:]
        ]

    return "Command line was: {} {}".format(program_name, " ".join(arguments))