Source code for dpdata.lammps.lmp

#!/usr/bin/env python3
from __future__ import annotations

import numpy as np

ptr_float_fmt = "%15.10f"
ptr_int_fmt = "%6d"
ptr_key_fmt = "%15s"

# Mapping of LAMMPS atom styles to their column layouts
# Format: (atom_id_col, atom_type_col, x_col, y_col, z_col, has_molecule_id, has_charge, charge_col)
ATOM_STYLE_COLUMNS = {
    "atomic": (0, 1, 2, 3, 4, False, False, None),
    "angle": (0, 2, 3, 4, 5, True, False, None),
    "bond": (0, 2, 3, 4, 5, True, False, None),
    "charge": (0, 1, 3, 4, 5, False, True, 2),
    "full": (0, 2, 4, 5, 6, True, True, 3),
    "molecular": (0, 2, 3, 4, 5, True, False, None),
    "dipole": (0, 1, 3, 4, 5, False, True, 2),
    "sphere": (0, 1, 4, 5, 6, False, False, None),
}



[docs]
def detect_atom_style(lines: list[str]) -> str | None:
    """Detect LAMMPS atom style from data file content.

    Parameters
    ----------
    lines : list
        Lines from LAMMPS data file

    Returns
    -------
    str or None
        Detected atom style, or None if not detected
    """
    # Look for atom style in comments after "Atoms" section header
    atom_lines = get_atoms(lines)
    if not atom_lines:
        return None

    # Find the "Atoms" line
    for idx, line in enumerate(lines):
        if "Atoms" in line:
            # Check if there's a comment with atom style after "Atoms"
            if "#" in line:
                comment_part = line.split("#")[1].strip().lower()
                for style in ATOM_STYLE_COLUMNS:
                    if style in comment_part:
                        return style
            break

    # If no explicit style found, try to infer from first data line
    if atom_lines:
        first_line = atom_lines[0].split()
        num_cols = len(first_line)

        # Try to match based on number of columns and content patterns
        # This is a heuristic approach
        if num_cols == 5:
            # Could be atomic style: atom-ID atom-type x y z
            return "atomic"
        elif num_cols == 6:
            # Could be charge or bond/molecular style
            # Try to determine if column 2 (index 2) looks like a charge (float) or type (int)
            try:
                val = float(first_line[2])
                # If it's a small float, likely a charge
                if abs(val) < 10 and val != int(val):
                    return "charge"
                else:
                    # Likely molecule ID (integer), so bond/molecular style
                    return "bond"
            except ValueError:
                return "atomic"  # fallback
        elif num_cols == 7:
            # Could be full style: atom-ID molecule-ID atom-type charge x y z
            return "full"
        elif num_cols >= 8:
            # Could be dipole or sphere style
            # For now, default to dipole if we have enough columns
            return "dipole"

    return None  # Unable to detect



def _get_block(lines, keys):
    for idx in range(len(lines)):
        if keys in lines[idx]:
            break
    if idx == len(lines) - 1:
        return None
    idx_s = idx + 2
    idx = idx_s
    ret = []
    while True:
        if idx == len(lines) or len(lines[idx].split()) == 0:
            break
        else:
            ret.append(lines[idx])
        idx += 1
    return ret



[docs]
def lmpbox2box(lohi, tilt):
    xy = tilt[0]
    xz = tilt[1]
    yz = tilt[2]
    orig = np.array([lohi[0][0], lohi[1][0], lohi[2][0]])
    lens = []
    for dd in range(3):
        lens.append(lohi[dd][1] - lohi[dd][0])
    xx = [lens[0], 0, 0]
    yy = [xy, lens[1], 0]
    zz = [xz, yz, lens[2]]
    return orig, np.array([xx, yy, zz])




[docs]
def box2lmpbox(orig, box):
    lohi = np.zeros([3, 2])
    for dd in range(3):
        lohi[dd][0] = orig[dd]
    tilt = np.zeros(3)
    tilt[0] = box[1][0]
    tilt[1] = box[2][0]
    tilt[2] = box[2][1]
    lens = np.zeros(3)
    lens[0] = box[0][0]
    lens[1] = box[1][1]
    lens[2] = box[2][2]
    for dd in range(3):
        lohi[dd][1] = lohi[dd][0] + lens[dd]
    return lohi, tilt




[docs]
def get_atoms(lines):
    return _get_block(lines, "Atoms")




[docs]
def get_natoms(lines):
    for ii in lines:
        if "atoms" in ii:
            return int(ii.split()[0])
    return None




[docs]
def get_natomtypes(lines):
    for ii in lines:
        if "atom types" in ii:
            return int(ii.split()[0])
    return None



def _atom_info_mol(line):
    vec = line.split()
    # idx, mole_type, atom_type, charge, x, y, z
    return (
        int(vec[0]),
        int(vec[1]),
        int(vec[2]),
        float(vec[3]),
        float(vec[4]),
        float(vec[5]),
        float(vec[6]),
    )


def _atom_info_atom(line):
    vec = line.split()
    # idx, atom_type, x, y, z
    return int(vec[0]), int(vec[1]), float(vec[2]), float(vec[3]), float(vec[4])


def _atom_info_style(line: str, atom_style: str = "atomic") -> dict[str, int | float]:
    """Parse atom information based on the specified atom style.

    Parameters
    ----------
    line : str
        The atom line from LAMMPS data file
    atom_style : str
        The LAMMPS atom style (atomic, full, charge, etc.)

    Returns
    -------
    dict
        Dictionary containing parsed atom information with keys:
        'atom_id', 'atom_type', 'x', 'y', 'z', 'molecule_id' (if present), 'charge' (if present)
    """
    if atom_style not in ATOM_STYLE_COLUMNS:
        raise ValueError(
            f"Unsupported atom style: {atom_style}. Supported styles: {list(ATOM_STYLE_COLUMNS.keys())}"
        )

    vec = line.split()
    columns = ATOM_STYLE_COLUMNS[atom_style]

    result = {
        "atom_id": int(vec[columns[0]]),
        "atom_type": int(vec[columns[1]]),
        "x": float(vec[columns[2]]),
        "y": float(vec[columns[3]]),
        "z": float(vec[columns[4]]),
    }

    # Add molecule ID if present
    if columns[5]:  # has_molecule_id
        result["molecule_id"] = int(
            vec[1]
        )  # molecule ID is always in column 1 when present

    # Add charge if present
    if columns[6]:  # has_charge
        result["charge"] = float(vec[columns[7]])  # charge_col

    return result



[docs]
def get_natoms_vec(lines: list[str], atom_style: str = "atomic") -> list[int]:
    """Get number of atoms for each atom type.

    Parameters
    ----------
    lines : list
        Lines from LAMMPS data file
    atom_style : str
        The LAMMPS atom style

    Returns
    -------
    list
        Number of atoms for each atom type
    """
    atype = get_atype(lines, atom_style=atom_style)
    natoms_vec = []
    natomtypes = get_natomtypes(lines)
    for ii in range(natomtypes):
        natoms_vec.append(sum(atype == ii + 1))
    assert sum(natoms_vec) == get_natoms(lines)
    return natoms_vec




[docs]
def get_atype(
    lines: list[str], type_idx_zero: bool = False, atom_style: str = "atomic"
) -> np.ndarray:
    """Get atom types from LAMMPS data file.

    Parameters
    ----------
    lines : list
        Lines from LAMMPS data file
    type_idx_zero : bool
        Whether to use zero-based indexing for atom types
    atom_style : str
        The LAMMPS atom style

    Returns
    -------
    np.ndarray
        Array of atom types
    """
    alines = get_atoms(lines)
    atype = []
    for ii in alines:
        atom_info = _atom_info_style(ii, atom_style)
        at = atom_info["atom_type"]
        if type_idx_zero:
            atype.append(at - 1)
        else:
            atype.append(at)
    return np.array(atype, dtype=int)




[docs]
def get_posi(lines: list[str], atom_style: str = "atomic") -> np.ndarray:
    """Get atomic positions from LAMMPS data file.

    Parameters
    ----------
    lines : list
        Lines from LAMMPS data file
    atom_style : str
        The LAMMPS atom style

    Returns
    -------
    np.ndarray
        Array of atomic positions
    """
    atom_lines = get_atoms(lines)
    posis = []
    for ii in atom_lines:
        atom_info = _atom_info_style(ii, atom_style)
        posis.append([atom_info["x"], atom_info["y"], atom_info["z"]])
    return np.array(posis)




[docs]
def get_charges(lines: list[str], atom_style: str = "atomic") -> np.ndarray | None:
    """Get atomic charges from LAMMPS data file if the atom style supports charges.

    Parameters
    ----------
    lines : list
        Lines from LAMMPS data file
    atom_style : str
        The LAMMPS atom style

    Returns
    -------
    np.ndarray or None
        Array of atomic charges if atom style has charges, None otherwise
    """
    if atom_style not in ATOM_STYLE_COLUMNS:
        raise ValueError(f"Unsupported atom style: {atom_style}")

    # Check if this atom style has charges
    if not ATOM_STYLE_COLUMNS[atom_style][6]:  # has_charge
        return None

    atom_lines = get_atoms(lines)
    charges = []
    for ii in atom_lines:
        atom_info = _atom_info_style(ii, atom_style)
        charges.append(atom_info["charge"])
    return np.array(charges)




[docs]
def get_spins(lines: list[str], atom_style: str = "atomic") -> np.ndarray | None:
    atom_lines = get_atoms(lines)
    if len(atom_lines[0].split()) < 8:
        return None
    spins_ori = []
    spins_norm = []
    for ii in atom_lines:
        iis = ii.split()
        spins_ori.append([float(jj) for jj in iis[5:8]])
        spins_norm.append([float(iis[-1])])
    return np.array(spins_ori) * np.array(spins_norm)




[docs]
def get_lmpbox(lines):
    box_info = []
    tilt = np.zeros(3)
    for ii in lines:
        if "xlo" in ii and "xhi" in ii:
            box_info.append([float(ii.split()[0]), float(ii.split()[1])])
            break
    for ii in lines:
        if "ylo" in ii and "yhi" in ii:
            box_info.append([float(ii.split()[0]), float(ii.split()[1])])
            break
    for ii in lines:
        if "zlo" in ii and "zhi" in ii:
            box_info.append([float(ii.split()[0]), float(ii.split()[1])])
            break
    for ii in lines:
        if "xy" in ii and "xz" in ii and "yz" in ii:
            tilt = np.array([float(jj) for jj in ii.split()[0:3]])
    return box_info, tilt




[docs]
def system_data(
    lines: list[str],
    type_map: list[str] | None = None,
    type_idx_zero: bool = True,
    atom_style: str = "atomic",
) -> dict:
    """Parse LAMMPS data file to system data format.

    Parameters
    ----------
    lines : list
        Lines from LAMMPS data file
    type_map : list, optional
        Mapping from atom types to element names
    type_idx_zero : bool
        Whether to use zero-based indexing for atom types
    atom_style : str
        The LAMMPS atom style (atomic, full, charge, etc.)

    Returns
    -------
    dict
        System data dictionary
    """
    system = {}
    system["atom_numbs"] = get_natoms_vec(lines, atom_style=atom_style)
    system["atom_names"] = []
    if type_map is None:
        for ii in range(len(system["atom_numbs"])):
            system["atom_names"].append("Type_%d" % ii)  # noqa: UP031
    else:
        assert len(type_map) >= len(system["atom_numbs"])
        for ii in range(len(system["atom_numbs"])):
            system["atom_names"].append(type_map[ii])
    lohi, tilt = get_lmpbox(lines)
    orig, cell = lmpbox2box(lohi, tilt)
    system["orig"] = np.array(orig)
    system["cells"] = [np.array(cell)]
    natoms = sum(system["atom_numbs"])
    system["atom_types"] = get_atype(
        lines, type_idx_zero=type_idx_zero, atom_style=atom_style
    )
    system["coords"] = [get_posi(lines, atom_style=atom_style)]
    system["cells"] = np.array(system["cells"])
    system["coords"] = np.array(system["coords"])

    # Add charges if the atom style supports them
    charges = get_charges(lines, atom_style=atom_style)
    if charges is not None:
        system["charges"] = np.array([charges])

    spins = get_spins(lines, atom_style=atom_style)
    if spins is not None:
        system["spins"] = np.array([spins])

    return system




[docs]
def to_system_data(
    lines: list[str],
    type_map: list[str] | None = None,
    type_idx_zero: bool = True,
    atom_style: str = "atomic",
) -> dict:
    """Parse LAMMPS data file to system data format.

    Parameters
    ----------
    lines : list
        Lines from LAMMPS data file
    type_map : list, optional
        Mapping from atom types to element names
    type_idx_zero : bool
        Whether to use zero-based indexing for atom types
    atom_style : str
        The LAMMPS atom style. If "auto", attempts to detect automatically
        from file. Default is "atomic".

    Returns
    -------
    dict
        System data dictionary
    """
    # Attempt automatic detection if requested
    if atom_style == "auto":
        detected_style = detect_atom_style(lines)
        if detected_style:
            atom_style = detected_style
        else:
            atom_style = "atomic"  # fallback to default

    return system_data(
        lines, type_map=type_map, type_idx_zero=type_idx_zero, atom_style=atom_style
    )




[docs]
def rotate_to_lower_triangle(
    cell: np.ndarray, coord: np.ndarray
) -> tuple[np.ndarray, np.ndarray]:
    """Rotate the cell to lower triangular and ensure the diagonal elements are non-negative.

    Args:
        cell (np.ndarray): The original cell matrix.
        coord (np.ndarray): The coordinates of the atoms.

    Returns
    -------
    tuple[np.ndarray, np.ndarray]: The rotated cell and adjusted coordinates.
    """
    q, _ = np.linalg.qr(cell.T)
    cell = np.matmul(cell, q)
    coord = np.matmul(coord, q)

    # Ensure the diagonal elements of the cell are non-negative
    rot = np.eye(3)
    if cell[0][0] < 0:
        rot[0][0] = -1
    if cell[1][1] < 0:
        rot[1][1] = -1
    if cell[2][2] < 0:
        rot[2][2] = -1
    cell = np.matmul(cell, rot)
    coord = np.matmul(coord, rot)
    return cell, coord




[docs]
def from_system_data(system, f_idx=0):
    ret = ""
    ret += "\n"
    natoms = sum(system["atom_numbs"])
    ntypes = len(system["atom_numbs"])
    cell, coord = rotate_to_lower_triangle(
        system["cells"][f_idx], system["coords"][f_idx]
    )
    ret += "%d atoms\n" % natoms  # noqa: UP031
    ret += "%d atom types\n" % ntypes  # noqa: UP031
    ret += (ptr_float_fmt + " " + ptr_float_fmt + " xlo xhi\n") % (
        0,
        cell[0][0],
    )  # noqa: UP031
    ret += (ptr_float_fmt + " " + ptr_float_fmt + " ylo yhi\n") % (
        0,
        cell[1][1],
    )  # noqa: UP031
    ret += (ptr_float_fmt + " " + ptr_float_fmt + " zlo zhi\n") % (
        0,
        cell[2][2],
    )  # noqa: UP031
    ret += (
        ptr_float_fmt + " " + ptr_float_fmt + " " + ptr_float_fmt + " xy xz yz\n"
    ) % (
        cell[1][0],
        cell[2][0],
        cell[2][1],
    )  # noqa: UP031
    ret += "\n"
    ret += "Atoms # atomic\n"
    ret += "\n"
    coord_fmt = (
        ptr_int_fmt
        + " "
        + ptr_int_fmt
        + " "
        + ptr_float_fmt
        + " "
        + ptr_float_fmt
        + " "
        + ptr_float_fmt
        + "\n"
    )  # noqa: UP031

    if "spins" in system:
        coord_fmt = (
            coord_fmt.strip("\n")
            + " "
            + ptr_float_fmt
            + " "
            + ptr_float_fmt
            + " "
            + ptr_float_fmt
            + " "
            + ptr_float_fmt
            + "\n"
        )  # noqa: UP031
        spins_norm = np.linalg.norm(system["spins"][f_idx], axis=1)
    for ii in range(natoms):
        if "spins" in system:
            if spins_norm[ii] != 0:
                ret += coord_fmt % (
                    ii + 1,
                    system["atom_types"][ii] + 1,
                    coord[ii][0] - system["orig"][0],
                    coord[ii][1] - system["orig"][1],
                    coord[ii][2] - system["orig"][2],
                    system["spins"][f_idx][ii][0] / spins_norm[ii],
                    system["spins"][f_idx][ii][1] / spins_norm[ii],
                    system["spins"][f_idx][ii][2] / spins_norm[ii],
                    spins_norm[ii],
                )  # noqa: UP031
            else:
                ret += coord_fmt % (
                    ii + 1,
                    system["atom_types"][ii] + 1,
                    coord[ii][0] - system["orig"][0],
                    coord[ii][1] - system["orig"][1],
                    coord[ii][2] - system["orig"][2],
                    system["spins"][f_idx][ii][0],
                    system["spins"][f_idx][ii][1],
                    system["spins"][f_idx][ii][2] + 1,
                    spins_norm[ii],
                )  # noqa: UP031
        else:
            ret += coord_fmt % (
                ii + 1,
                system["atom_types"][ii] + 1,
                coord[ii][0] - system["orig"][0],
                coord[ii][1] - system["orig"][1],
                coord[ii][2] - system["orig"][2],
            )  # noqa: UP031
    return ret



if __name__ == "__main__":
    fname = "water-SPCE.data"
    lines = open(fname).read().split("\n")
    bonds, tilt = get_lmpbox(lines)
    # print(bonds, tilt)
    orig, box = lmpbox2box(bonds, tilt)
    # print(orig, box)
    bonds1, tilt1 = box2lmpbox(orig, box)
    # print(bonds1, tilt1)
    print(bonds1 - bonds)
    print(tilt1 - tilt)
    print(box)
    print(get_atype(lines))
    print(get_posi(lines))