Source code for dpgen.collect.collect

#!/usr/bin/env python3

import argparse
import glob
import json
import os

import dpdata

from dpgen.generator.run import data_system_fmt
from dpgen.util import expand_sys_str


def collect_data(
    target_folder, param_file, output, verbose=True, shuffle=True, merge=True
):
    """Gather the initial and per-iteration labeled data of a DP-GEN job.

    The function changes into ``target_folder``, reads ``param_file`` and
    loads (as ``deepmd/npy`` ``dpdata.LabeledSystem`` objects):

    * every entry of ``init_data_sys`` (joined with ``init_data_prefix``);
    * every system found under ``iter.NNNNNN/02.fp/data.[0-9]*[0-9]`` for as
      many iterations as there are entries in ``model_devi_jobs``.

    The loaded systems are then written below ``output`` as
    ``init.<data_system_fmt % index>`` and ``sys.<key>`` directories.

    Parameters
    ----------
    target_folder : str
        Directory of the DP-GEN job. It becomes the working directory while
        the data is being loaded.
    param_file : str
        JSON parameter file. It is opened after changing into
        ``target_folder``, so a relative path is resolved against it.
    output : str
        Directory that receives the collected data; created if missing.
        A relative path is resolved against the caller's working directory.
    verbose : bool, optional
        Print one summary line (name, formula, natoms, nframes) per system.
    shuffle : bool, optional
        Call ``shuffle()`` on every collected iteration system before dumping.
    merge : bool, optional
        If true, iteration systems are grouped by their ``formula``;
        otherwise by the text after the last ``.`` in the data directory
        name (used as an index into ``sys_configs`` when printing).

    Raises
    ------
    OSError
        If ``target_folder`` cannot be entered or ``param_file`` cannot be
        opened.
    json.JSONDecodeError
        If ``param_file`` is not valid JSON.
    """
    target_folder = os.path.abspath(target_folder)
    # Resolve the output path now: it must not depend on the chdir below.
    output = os.path.abspath(output)

    cwd = os.getcwd()
    os.chdir(target_folder)
    try:
        with open(param_file) as fp:
            jdata = json.load(fp)
        sys_configs = jdata.get("sys_configs", [])

        # init systems
        init_data_prefix = jdata.get("init_data_prefix", "")
        init_data_sys = jdata.get("init_data_sys", [])
        init_data = [
            dpdata.LabeledSystem(
                os.path.join(init_data_prefix, name), fmt="deepmd/npy"
            )
            for name in init_data_sys
        ]

        # collect systems from iter dirs, one directory per model_devi job
        coll_data = {}
        numb_jobs = len(jdata.get("model_devi_jobs", {}))
        for iter_idx in range(numb_jobs):
            pattern = os.path.join(
                "iter.%06d" % iter_idx, "02.fp", "data.[0-9]*[0-9]"
            )
            for data_dir in sorted(glob.glob(pattern)):
                # expand_sys_str may yield several system paths per directory
                for sys_path in expand_sys_str(data_dir):
                    system = dpdata.LabeledSystem(sys_path, fmt="deepmd/npy")
                    if merge:
                        sys_str = system.formula
                    else:
                        sys_str = os.path.basename(sys_path).split(".")[-1]
                    if sys_str in coll_data:
                        coll_data[sys_str].append(system)
                    else:
                        coll_data[sys_str] = system
    finally:
        # Always restore the caller's working directory, even when reading
        # the parameters or loading a system fails.
        os.chdir(cwd)

    # print information
    if verbose:
        # default=0 keeps an empty "sys_configs" from raising ValueError
        max_str_len = max((len(str(cfg)) for cfg in sys_configs), default=0)
        max_form_len = 16
        ptr_fmt = "%%%ds %%%ds natoms %%6d nframes %%6d" % (
            max_str_len + 5,
            max_form_len,
        )
        for name, system in zip(init_data_sys, init_data):
            print(
                ptr_fmt
                % (
                    str(name),
                    system.formula,
                    system.get_natoms(),
                    system.get_nframes(),
                )
            )
        for key in sorted(coll_data):
            if merge:
                sys_str = key
            else:
                sys_str = str(sys_configs[int(key)])
            print(
                ptr_fmt
                % (
                    sys_str,
                    coll_data[key].formula,
                    coll_data[key].get_natoms(),
                    coll_data[key].get_nframes(),
                )
            )

    # shuffle system data
    if shuffle:
        for system in coll_data.values():
            system.shuffle()

    # create output dir
    os.makedirs(output, exist_ok=True)

    # dump init data
    for idx, system in enumerate(init_data):
        out_dir = "init." + (data_system_fmt % idx)
        system.to("deepmd/npy", os.path.join(output, out_dir))

    # dump iter data; set_size is the full frame count of each system
    for key, system in coll_data.items():
        out_dir = "sys.%s" % key
        nframes = system.get_nframes()
        system.to("deepmd/npy", os.path.join(output, out_dir), set_size=nframes)
def gen_collect(args):
    """Run :func:`collect_data` with options taken from parsed arguments.

    Parameters
    ----------
    args : argparse.Namespace
        Must provide ``JOB_DIR``, ``parameter``, ``OUTPUT``, ``verbose``,
        ``shuffle`` and ``merge`` attributes.
    """
    job_dir, param_file, out_dir = args.JOB_DIR, args.parameter, args.OUTPUT
    switches = {
        "verbose": args.verbose,
        "shuffle": args.shuffle,
        "merge": args.merge,
    }
    collect_data(job_dir, param_file, out_dir, **switches)
def _main(): parser = argparse.ArgumentParser(description="Collect data from DP-GEN iterations") parser.add_argument("JOB_DIR", type=str, help="the directory of the DP-GEN job") parser.add_argument("OUTPUT", type=str, help="the output directory of data") parser.add_argument( "-p", "--parameter", type=str, default="param.json", help="the json file provides DP-GEN paramters, should be located in JOB_DIR", ) parser.add_argument( "-v", "--verbose", action="store_true", help="print number of data in each system", ) parser.add_argument( "-m", "--merge", action="store_true", help="merge the systems with the same chemical formula", ) parser.add_argument( "-s", "--shuffle", action="store_true", help="shuffle the data systems" ) args = parser.parse_args() gen_collect(args) if __name__ == "__main__": _main()