Source code for dpdata.deepmd.hdf5

"""Utils for deepmd/hdf5 format."""

from __future__ import annotations

import warnings

try:
    import h5py
except ImportError:
    pass
import numpy as np
from wcmatch.glob import globfilter

import dpdata

__all__ = ["to_system_data", "dump"]


[docs] def to_system_data( f: h5py.File | h5py.Group, folder: str, type_map: list | None = None, labels: bool = True, ): """Load a HDF5 file. Parameters ---------- f : h5py.File or h5py.Group HDF5 file or group object folder : str path in the HDF5 file type_map : list type map labels : bool labels """ g = f[folder] if folder else f data = {} # ignore empty files or groups if "type.raw" not in g.keys(): return data data["atom_types"] = g["type.raw"][:] ntypes = np.max(data["atom_types"]) + 1 natoms = data["atom_types"].size data["atom_numbs"] = [] for ii in range(ntypes): data["atom_numbs"].append(np.count_nonzero(data["atom_types"] == ii)) data["atom_names"] = [] # if find type_map.raw, use it if "type_map.raw" in g.keys(): my_type_map = list(np.char.decode(g["type_map.raw"][:])) # else try to use arg type_map elif type_map is not None: my_type_map = type_map # in the last case, make artificial atom names else: my_type_map = [] for ii in range(ntypes): my_type_map.append("Type_%d" % ii) assert len(my_type_map) >= len(data["atom_numbs"]) for ii in range(len(data["atom_numbs"])): data["atom_names"].append(my_type_map[ii]) data["orig"] = np.zeros([3]) if "nopbc" in g.keys(): data["nopbc"] = True sets = globfilter(g.keys(), "set.*") data_types = { "cells": { "fn": "box", "labeled": False, "shape": (3, 3), "required": "nopbc" not in data, }, "coords": { "fn": "coord", "labeled": False, "shape": (natoms, 3), "required": True, }, "energies": { "fn": "energy", "labeled": True, "shape": tuple(), "required": False, }, "forces": { "fn": "force", "labeled": True, "shape": (natoms, 3), "required": False, }, "virials": { "fn": "virial", "labeled": True, "shape": (3, 3), "required": False, }, } # allow custom dtypes for dtype in dpdata.system.LabeledSystem.DTYPES: if dtype.name in ( "atom_numbs", "atom_names", "atom_types", "orig", "cells", "coords", "real_atom_types", "real_atom_names", "nopbc", "energies", "forces", "virials", ): # skip as these data contains specific rules continue if not (len(dtype.shape) and dtype.shape[0] == dpdata.system.Axis.NFRAMES): warnings.warn( f"Shape of {dtype.name} is not (nframes, ...), but {dtype.shape}. This type of data will not converted from deepmd/hdf5 format." ) continue shape = [ natoms if xx == dpdata.system.Axis.NATOMS else xx for xx in dtype.shape[1:] ] data_types[dtype.name] = { "fn": dtype.name, "labeled": True, "shape": shape, "required": False, } for dt, prop in data_types.items(): all_data = [] for ii in sets: set = g[ii] fn = "%s.npy" % prop["fn"] if fn in set.keys(): dd = set[fn][:] nframes = dd.shape[0] all_data.append(np.reshape(dd, (nframes, *prop["shape"]))) elif prop["required"]: raise RuntimeError(f"{folder}/{ii}/{fn} not found") if len(all_data) > 0: data[dt] = np.concatenate(all_data, axis=0) if "cells" not in data: nframes = data["coords"].shape[0] data["cells"] = np.zeros((nframes, 3, 3)) return data
[docs] def dump( f: h5py.File | h5py.Group, folder: str, data: dict, set_size=5000, comp_prec=np.float32, ) -> None: """Dump data to a HDF5 file. Parameters ---------- f : h5py.File or h5py.Group HDF5 file or group object folder : str path in the HDF5 file data : dict System or LabeledSystem data set_size : int, default: 5000 size of a set comp_prec : np.dtype, default: np.float32 precision of data """ # if folder is None, use the root of the file if folder: if folder in f: del f[folder] g = f.create_group(folder) else: g = f # ignore empty systems if not len(data["coords"]): return # dump raw (array in fact) g.create_dataset("type.raw", data=data["atom_types"]) g.create_dataset("type_map.raw", data=np.array(data["atom_names"], dtype="S")) # BondOrder System if "bonds" in data: g.create_dataset("bonds.raw", data=data["bonds"]) if "formal_charges" in data: g.create_dataset("formal_charges.raw", data=data["formal_charges"]) # reshape frame properties and convert prec nframes = data["cells"].shape[0] nopbc = data.get("nopbc", False) reshaped_data = {} data_types = { "cells": {"fn": "box", "shape": (nframes, 9), "dump": not nopbc}, "coords": {"fn": "coord", "shape": (nframes, -1), "dump": True}, "energies": {"fn": "energy", "shape": (nframes,), "dump": True}, "forces": {"fn": "force", "shape": (nframes, -1), "dump": True}, "virials": {"fn": "virial", "shape": (nframes, 9), "dump": True}, } # allow custom dtypes for dtype in dpdata.system.LabeledSystem.DTYPES: if dtype.name in ( "atom_numbs", "atom_names", "atom_types", "orig", "cells", "coords", "real_atom_types", "real_atom_names", "nopbc", "energies", "forces", "virials", ): # skip as these data contains specific rules continue if not (len(dtype.shape) and dtype.shape[0] == dpdata.system.Axis.NFRAMES): warnings.warn( f"Shape of {dtype.name} is not (nframes, ...), but {dtype.shape}. This type of data will not converted to deepmd/hdf5 format." ) continue data_types[dtype.name] = { "fn": dtype.name, "shape": (nframes, -1), "dump": True, } for dt, prop in data_types.items(): if dt in data: if prop["dump"]: reshaped_data[dt] = np.reshape(data[dt], prop["shape"]).astype( comp_prec ) # dump frame properties: cell, coord, energy, force and virial nsets = nframes // set_size if set_size * nsets < nframes: nsets += 1 for ii in range(nsets): set_stt = ii * set_size set_end = (ii + 1) * set_size set_folder = g.create_group("set.%03d" % ii) for dt, prop in data_types.items(): if dt in reshaped_data: set_folder.create_dataset( "%s.npy" % prop["fn"], data=reshaped_data[dt][set_stt:set_end] ) if nopbc: g.create_dataset("nopbc", data=True)