# Source code for dpdata.cp2k.output

# %%
import math
import re
from collections import OrderedDict

import numpy as np

from ..unit import (
    EnergyConversion,
    ForceConversion,
    LengthConversion,
    PressureConversion,
)
from .cell import cell_to_low_triangle

# Unit conversion factors obtained from the project's unit helpers:
# bohr -> angstrom, hartree -> eV and hartree/bohr -> eV/angstrom.
AU_TO_ANG = LengthConversion("bohr", "angstrom").value()
AU_TO_EV = EnergyConversion("hartree", "eV").value()
AU_TO_EV_EVERY_ANG = ForceConversion("hartree/bohr", "eV/angstrom").value()

# Patterns for the lines that delimit blocks of the log file.
delimiter_p1 = re.compile(r"^ \* GO CP2K GO! \*+")
delimiter_p2 = re.compile(r"^ \*+")
delimiter_patterns = [delimiter_p1, delimiter_p2]

# The line following a delimiter must match one of these for the block that
# starts there to be treated as a frame.
avail_patterns = [
    re.compile(r"^ INITIAL POTENTIAL ENERGY"),
    re.compile(r"^ ENSEMBLE TYPE"),
]


class Cp2kSystems:
    """Deal with cp2k output files.

    The object is an iterator: each step reads one block from the log file
    and one block from the xyz file, parses both and returns the merged
    information as a dict.

    Parameters
    ----------
    log_file_name
        Path of the cp2k log file; opened for reading.
    xyz_file_name
        Path of the xyz file; opened for reading.
    restart : bool
        When true, the first log block is parsed immediately during
        construction and its result discarded, so that iteration starts at
        the following block. The cell, print level and atomic kinds found in
        that block are still remembered on the instance.
    """

    def __init__(self, log_file_name, xyz_file_name, restart=False):
        self.log_file_object = open(log_file_name)
        self.xyz_file_object = open(xyz_file_name)
        self.log_block_generator = self.get_log_block_generator()
        self.xyz_block_generator = self.get_xyz_block_generator()
        self.restart_flag = restart

        # Values carried over between frames; a frame that does not print
        # them reuses what an earlier frame provided.
        self.cell = None
        self.print_level = None
        self.atomic_kinds = None

        if self.restart_flag:
            self.handle_single_log_frame(next(self.log_block_generator))

    def __del__(self):
        # __init__ may have failed before both handles were created, so only
        # close the ones that actually exist.
        for attr_name in ("log_file_object", "xyz_file_object"):
            handle = getattr(self, attr_name, None)
            if handle is not None:
                handle.close()

    def __iter__(self):
        return self

    def __next__(self):
        """Parse the next log block and xyz block and merge the results.

        Returns
        -------
        dict
            The log-frame dict updated with the xyz-frame dict (xyz values
            win on shared keys).

        Raises
        ------
        StopIteration
            When either block generator is exhausted.
        AssertionError
            When the energies of the two blocks differ by more than 1e-6.
        """
        log_info_dict = self.handle_single_log_frame(next(self.log_block_generator))
        xyz_info_dict = self.handle_single_xyz_frame(next(self.xyz_block_generator))

        # Only the energies are cross-checked between the two files. Both
        # handlers return arrays of length one; compare the scalars so that
        # math.isclose never has to convert an array to a float.
        assert math.isclose(
            log_info_dict["energies"][0],
            xyz_info_dict["energies"][0],
            abs_tol=1.0e-6,
        ), (
            log_info_dict["energies"],
            xyz_info_dict["energies"],
            "There may be errors in the file",
        )

        info_dict = {}
        info_dict.update(log_info_dict)
        info_dict.update(xyz_info_dict)
        return info_dict

    def get_log_block_generator(self):
        """Yield the log file block by block, each block as a list of lines.

        A block is closed by a line matching ``delimiter_patterns`` once an
        earlier delimiter was followed by a line matching ``avail_patterns``.

        Raises
        ------
        RuntimeError
            If the end of the file is reached without any block having been
            yielded, or if the file ends while a block is still open.
        """
        lines = []
        delimiter_flag = False
        yield_flag = False
        while True:
            line = self.log_file_object.readline()
            if not line:
                # End of file. A generator must not raise StopIteration
                # itself (PEP 479 turns it into an opaque RuntimeError), so
                # raise RuntimeError directly and keep the message.
                if not yield_flag:
                    raise RuntimeError("None of the delimiter patterns are matched")
                break
            lines.append(line)
            if any(p.match(line) for p in delimiter_patterns):
                if delimiter_flag is True:
                    yield_flag = True
                    yield lines
                    lines = []
                    delimiter_flag = False
                else:
                    # Peek at the line after the delimiter to decide whether
                    # a frame starts here.
                    line = self.log_file_object.readline()
                    lines.append(line)
                    if any(p.match(line) for p in avail_patterns):
                        delimiter_flag = True
        if delimiter_flag is True:
            raise RuntimeError("This file lacks some content, please check")

    def get_xyz_block_generator(self):
        """Yield the xyz file frame by frame, each frame as a list of lines.

        A frame starts at a line beginning with an integer ``atom_num`` and
        consists of that line plus the following ``atom_num + 1`` lines.

        Raises
        ------
        RuntimeError
            If the end of the file is reached without any frame having been
            yielded, or if a frame is cut short by the end of the file.
        """
        p3 = re.compile(r"^\s*(\d+)\s*")
        yield_flag = False
        while True:
            line = self.xyz_file_object.readline()
            if not line:
                # See get_log_block_generator: StopIteration must not be
                # raised inside a generator body.
                if not yield_flag:
                    raise RuntimeError("None of the xyz patterns are matched")
                break
            header = p3.match(line)
            if header:
                yield_flag = True
                atom_num = int(header.group(1))
                lines = [line]
                for _ in range(atom_num + 1):
                    lines.append(self.xyz_file_object.readline())
                # readline() returns "" at end of file, so an empty last
                # entry means the frame is incomplete.
                if not lines[-1]:
                    raise RuntimeError(
                        f"this xyz file may lack of lines, should be {atom_num + 2};lines:{lines}"
                    )
                yield lines

    def handle_single_log_frame(self, lines):
        """Parse one block of log lines.

        Energies are scaled by ``AU_TO_EV``, forces by ``AU_TO_EV_EVERY_ANG``
        and ``CELL LNTHS`` values by ``AU_TO_ANG``. Cell, print level and
        atomic kinds found in the block are stored on the instance and reused
        for later blocks that do not print them.

        Parameters
        ----------
        lines : list of str
            The lines of one block as produced by ``get_log_block_generator``.

        Returns
        -------
        dict
            Keys ``atom_names``, ``atom_numbs``, ``atom_types``,
            ``print_level``, ``cells``, ``energies``, ``forces`` and, when a
            stress block was found, ``virials``.

        Raises
        ------
        RuntimeError
            If the block reports the print level ``LOW``.
        """
        info_dict = {}
        energy_pattern_1 = re.compile(
            r" INITIAL POTENTIAL ENERGY\[hartree\]\s+=\s+(?P<number>\S+)"
        )
        energy_pattern_2 = re.compile(
            r" POTENTIAL ENERGY\[hartree\]\s+=\s+(?P<number>\S+)"
        )
        cell_length_pattern = re.compile(
            r" (INITIAL ){0,1}CELL LNTHS\[bohr\]\s+=\s+(?P<A>\S+)\s+(?P<B>\S+)\s+(?P<C>\S+)"
        )
        cell_angle_pattern = re.compile(
            r" (INITIAL ){0,1}CELL ANGLS\[deg\]\s+=\s+(?P<alpha>\S+)\s+(?P<beta>\S+)\s+(?P<gamma>\S+)"
        )
        cell_a_pattern = re.compile(
            r" CELL\| Vector a \[angstrom\]:\s+(?P<ax>\S+)\s+(?P<ay>\S+)\s+(?P<az>\S+)"
        )
        cell_b_pattern = re.compile(
            r" CELL\| Vector b \[angstrom\]:\s+(?P<bx>\S+)\s+(?P<by>\S+)\s+(?P<bz>\S+)"
        )
        cell_c_pattern = re.compile(
            r" CELL\| Vector c \[angstrom\]:\s+(?P<cx>\S+)\s+(?P<cy>\S+)\s+(?P<cz>\S+)"
        )
        force_start_pattern = re.compile(r" ATOMIC FORCES in")
        force_end_pattern = re.compile(r" SUM OF ATOMIC FORCES")
        print_level_pattern = re.compile(
            r" GLOBAL\| Global print level\s+(?P<print_level>\S+)"
        )
        atomic_kinds_pattern = re.compile(r"\s+\d+\. Atomic kind:\s+(?P<akind>\S+)")

        energy = None
        cell_A, cell_B, cell_C = (0, 0, 0)
        cell_alpha, cell_beta, cell_gamma = (0, 0, 0)
        # Number of cell-related lines seen: 2 means lengths + angles,
        # 5 means lengths + angles + the three explicit vectors.
        cell_flag = 0
        print_level_flag = 0
        force_flag = False
        force_lines = []
        atomic_kinds = []
        stress_sign = "STRESS"
        # Stress state: 0 idle, 1 on the line after the marker, 2 one line
        # later, 3 reading tensor rows until a blank line.
        stress_flag = 0
        stress = []

        for line in lines:
            # The order of these four checks is what skips exactly two lines
            # after the marker line before rows are collected.
            if stress_flag == 3:
                if line == "\n":
                    stress_flag = 0
                else:
                    stress.append(line.split()[1:4])
            if stress_flag == 2:
                stress_flag = 3
            if stress_flag == 1:
                stress_flag = 2
            if stress_sign in line:
                stress_flag = 1

            if force_start_pattern.match(line):
                force_flag = True
            if force_end_pattern.match(line):
                assert force_flag is True, (
                    force_flag,
                    "there may be errors in this file ",
                )
                force_flag = False
            if force_flag is True:
                force_lines.append(line)

            found = energy_pattern_1.match(line)
            if found:
                energy = float(found.group("number")) * AU_TO_EV
            found = energy_pattern_2.match(line)
            if found:
                energy = float(found.group("number")) * AU_TO_EV

            found = cell_length_pattern.match(line)
            if found:
                cell_A = float(found.group("A")) * AU_TO_ANG
                cell_B = float(found.group("B")) * AU_TO_ANG
                cell_C = float(found.group("C")) * AU_TO_ANG
                cell_flag += 1
            found = cell_angle_pattern.match(line)
            if found:
                cell_alpha = np.deg2rad(float(found.group("alpha")))
                cell_beta = np.deg2rad(float(found.group("beta")))
                cell_gamma = np.deg2rad(float(found.group("gamma")))
                cell_flag += 1

            found = print_level_pattern.match(line)
            if found:
                print_level = found.group("print_level")
                print_level_flag += 1

            found = cell_a_pattern.match(line)
            if found:
                cell_ax = float(found.group("ax"))
                cell_ay = float(found.group("ay"))
                cell_az = float(found.group("az"))
                cell_flag += 1
            found = cell_b_pattern.match(line)
            if found:
                cell_bx = float(found.group("bx"))
                cell_by = float(found.group("by"))
                cell_bz = float(found.group("bz"))
                cell_flag += 1
            found = cell_c_pattern.match(line)
            if found:
                cell_cx = float(found.group("cx"))
                cell_cy = float(found.group("cy"))
                cell_cz = float(found.group("cz"))
                cell_flag += 1

            found = atomic_kinds_pattern.match(line)
            if found:
                atomic_kinds.append(found.group("akind"))

        if print_level_flag == 1:
            self.print_level = print_level
            if print_level == "LOW":
                raise RuntimeError(
                    "please provide cp2k output with higher print level(at least MEDIUM)"
                )

        if cell_flag == 2:
            self.cell = cell_to_low_triangle(
                cell_A, cell_B, cell_C, cell_alpha, cell_beta, cell_gamma
            )
        elif cell_flag == 5:
            self.cell = np.asarray(
                [
                    [cell_ax, cell_ay, cell_az],
                    [cell_bx, cell_by, cell_bz],
                    [cell_cx, cell_cy, cell_cz],
                ]
            ).astype("float64")

        if atomic_kinds:
            self.atomic_kinds = atomic_kinds

        # The first three collected force lines are skipped; the remaining
        # ones carry one atom each, with the kind in column 1 and the force
        # components in columns 3-5.
        element_index = -1
        element_dict = OrderedDict()  # kind -> [type index, atom count]
        atom_types_idx_list = []
        forces_list = []
        for line in force_lines[3:]:
            line_list = line.split()
            kind = line_list[1]
            if element_dict.get(kind):
                element_dict[kind][1] += 1
            else:
                element_index += 1
                element_dict[kind] = [element_index, 1]
            atom_types_idx_list.append(element_dict[kind][0])
            forces_list.append(
                [
                    float(line_list[3]) * AU_TO_EV_EVERY_ANG,
                    float(line_list[4]) * AU_TO_EV_EVERY_ANG,
                    float(line_list[5]) * AU_TO_EV_EVERY_ANG,
                ]
            )

        atom_names = self.atomic_kinds
        atom_numbs = [entry[1] for entry in element_dict.values()]

        GPa = PressureConversion("eV/angstrom^3", "GPa").value()
        if stress:
            stress = np.array(stress)
            stress = stress.astype("float64")
            stress = stress[np.newaxis, :, :]
            # stress to virial conversion, default unit in cp2k is GPa
            # note the stress is virial = stress * volume
            virial = stress * np.linalg.det(self.cell) / GPa
            virial = virial.squeeze()
        else:
            virial = None

        info_dict["atom_names"] = atom_names
        info_dict["atom_numbs"] = atom_numbs
        info_dict["atom_types"] = np.asarray(atom_types_idx_list)
        info_dict["print_level"] = self.print_level
        info_dict["cells"] = np.asarray([self.cell]).astype("float64")
        info_dict["energies"] = np.asarray([energy]).astype("float64")
        info_dict["forces"] = np.asarray([forces_list]).astype("float64")
        if virial is not None:
            info_dict["virials"] = np.asarray([virial]).astype("float64")
        return info_dict

    def handle_single_xyz_frame(self, lines):
        """Parse one xyz frame.

        Parameters
        ----------
        lines : list of str
            The atom-count line, the comment line and one line per atom.

        Returns
        -------
        dict
            ``coords`` with shape (1, atom_num, 3), taken from columns 1-3 of
            the atom lines without scaling; ``energies`` with shape (1,),
            the ``E`` property of the comment line times ``AU_TO_EV`` or 0
            when it is absent; ``orig`` as three zeros.

        Raises
        ------
        RuntimeError
            If the number of lines is not ``atom_num + 2``.
        """
        info_dict = {}
        atom_num = int(lines[0].strip("\n").strip())
        if len(lines) != atom_num + 2:
            raise RuntimeError(
                f"format error, atom_num=={atom_num}, {len(lines)}!=atom_num+2"
            )

        # A trailing blank guarantees that the last "key = value" pair is
        # terminated for the pattern below.
        data_format_line = lines[1].strip("\n").strip() + " "
        prop_pattern = re.compile(r"(?P<prop>\w+)\s*=\s*(?P<number>.*?)[, ]")
        prop_dict = dict(prop_pattern.findall(data_format_line))
        energy = 0
        if prop_dict.get("E"):
            energy = float(prop_dict.get("E")) * AU_TO_EV

        coords_list = []
        for line in lines[2:]:
            line_list = line.split()
            coords_list.append(
                [float(line_list[1]), float(line_list[2]), float(line_list[3])]
            )

        info_dict["coords"] = np.asarray([coords_list]).astype("float64")
        info_dict["energies"] = np.array([energy]).astype("float64")
        info_dict["orig"] = np.zeros(3)
        return info_dict
# %%
def get_frames(fname):
    """Parse one frame from a cp2k output file.

    Only the part of the file after the last line containing
    ``Multiplication driver`` is parsed, so that output appended to an
    earlier run is ignored. When no such line exists the whole file is
    parsed.

    Parameters
    ----------
    fname
        Path of the cp2k output file.

    Returns
    -------
    tuple
        ``(atom_names, atom_numbs, atom_types, cell, coord, energy, force,
        virial)``. ``cell``, ``coord`` and ``force`` carry a leading axis of
        length one; ``energy`` has shape (1,). ``virial`` is ``None`` when no
        stress tensor was found. If the file never reports
        ``SCF run converged`` the result is seven empty lists followed by
        ``None``.

    Raises
    ------
    AssertionError
        If no coordinates, no energy or no forces were found.
    """
    eV = EnergyConversion("hartree", "eV").value()
    angstrom = LengthConversion("bohr", "angstrom").value()
    GPa = PressureConversion("eV/angstrom^3", "GPa").value()

    coord_flag = False
    force_flag = False
    stress_flag = False
    atom_symbol_idx_list = []
    atom_symbol_list = []
    cell = []
    coord = []
    force = []
    stress = []
    # Start as None so that a missing "ENERGY|" line is reported by the
    # assertion below instead of an unbound-variable error.
    energy = None

    # Read the file once; the context manager closes it even if parsing
    # raises.
    with open(fname) as fp:
        file_lines = fp.readlines()

    # check if output is converged, if not, return
    if not any("SCF run converged" in text for text in file_lines):
        return [], [], [], [], [], [], [], None

    # search duplicated header and parse from the last one
    last_header = -1
    for idx, text in enumerate(file_lines):
        if "Multiplication driver" in text:
            last_header = idx

    for idx, ii in enumerate(file_lines):
        if idx <= last_header:
            continue
        if "CELL| Vector" in ii:
            cell.append(ii.split()[4:7])
        if "Atomic kind:" in ii:
            atom_symbol_list.append(ii.split()[3])

        # beginning of coords block
        # NOTE(review): both alternatives read identically here; confirm the
        # intended header spellings (their whitespace may differ).
        if "Atom Kind Element" in ii or "Atom Kind Element" in ii:
            coord_flag = True
        # parse coords lines
        elif coord_flag:
            if ii == "\n":
                # skip empty line at the beginning; a blank line after some
                # coordinates were read ends the block
                coord_flag = len(coord) == 0
            else:
                coord.append(ii.split()[4:7])
                atom_symbol_idx_list.append(ii.split()[1])

        if "ENERGY|" in ii:
            energy = ii.split()[8]

        if " Atom Kind " in ii:
            force_flag = True
            force_idx = idx
        if force_flag:
            # the header line itself carries no force values
            if idx > force_idx:
                if "SUM OF ATOMIC FORCES" in ii:
                    force_flag = False
                else:
                    force.append(ii.split()[3:6])

        # add reading stress tensor
        if "STRESS TENSOR [GPa" in ii:
            stress_flag = True
            stress_idx = idx
        if stress_flag:
            # rows start three lines after the marker and end at a blank line
            if idx > stress_idx + 2:
                if ii == "\n":
                    stress_flag = False
                else:
                    stress.append(ii.split()[1:4])

    assert coord, "cannot find coords"
    assert energy, "cannot find energies"
    assert force, "cannot find forces"

    # convert to float arrays and add an extra dimension for nframes
    cell = np.array(cell).astype("float64")[np.newaxis, :, :]
    coord = np.array(coord).astype("float64")[np.newaxis, :, :]
    force = np.array(force).astype("float64")[np.newaxis, :, :]

    # The kind column of the coords block is a 1-based index into the list
    # of "Atomic kind:" symbols.
    atom_symbol_idx_list = np.array(atom_symbol_idx_list).astype(int) - 1
    atom_symbol_list = np.array(atom_symbol_list)[atom_symbol_idx_list]

    # virial is not necessary
    if stress:
        stress = np.array(stress).astype("float64")[np.newaxis, :, :]
        # stress to virial conversion, default unit in cp2k is GPa
        # note the stress is virial = stress * volume
        virial = stress * np.linalg.det(cell[0]) / GPa
    else:
        virial = None

    # force unit conversion, default unit in cp2k is hartree/bohr
    force = force * eV / angstrom
    # energy unit conversion, default unit in cp2k is hartree
    energy = np.array(float(energy) * eV).astype("float64")[np.newaxis]

    # preserve the atom_name order: unique symbols in order of first
    # appearance
    _, symbol_idx = np.unique(atom_symbol_list, return_index=True)
    atom_names = atom_symbol_list[np.sort(symbol_idx, kind="stable")]
    type_of_name = {name: index for index, name in enumerate(atom_names)}
    atom_types = [type_of_name[symbol] for symbol in atom_symbol_list]
    atom_numbs = [atom_types.count(index) for index in range(len(atom_names))]
    atom_types = np.array(atom_types)

    return list(atom_names), atom_numbs, atom_types, cell, coord, energy, force, virial
# %%