Source code for dpgen.tools.collect_data

#!/usr/bin/env python3

import argparse
import glob
import json
import os
import subprocess as sp


def file_len(fname):
    """Return the number of lines in the text file *fname*.

    Parameters
    ----------
    fname : str
        Path of the file to read.

    Returns
    -------
    int
        Number of lines in the file; ``0`` for an empty file. A final line
        without a trailing newline is counted as a line.
    """
    with open(fname) as f:
        # Counting with a generator avoids relying on a loop variable that
        # would be unbound when the file has no lines at all.
        return sum(1 for _ in f)
def _replace_symlink(src, dst):
    """Create symlink *dst* pointing to *src*, replacing any existing entry."""
    # lexists (unlike exists) is also true for a dangling symlink, which
    # would otherwise make os.symlink fail with FileExistsError.
    if os.path.lexists(dst):
        os.remove(dst)
    os.symlink(src, dst)


def collect_data(target_folder, param_file, output, verbose=True):
    """Collect the ``02.fp`` data of all iterations into per-system data sets.

    Every ``iter.*/02.fp/data.*`` directory found in *target_folder* is
    grouped by the integer suffix of its ``data.*`` name. For each non-empty
    group a directory ``system.NNN`` is created under *output*; the iteration
    data directories are symlinked into it, their ``orig/data.configs`` files
    are concatenated into ``orig/data.configs``, and the template scripts
    ``convert2raw.py``, ``shuffle_raw.py`` and ``raw_to_set.sh`` (looked up in
    ``../template`` relative to this file) are run on the result.

    Parameters
    ----------
    target_folder : str
        Directory containing the ``iter.*`` directories.
    param_file : str
        JSON file with a ``"sys_configs"`` entry. A relative path is resolved
        against *target_folder*.
    output : str
        Directory in which the ``system.NNN`` directories are created. It is
        created if it does not exist.
    verbose : bool, optional
        If true, print each collected system together with the number of
        lines of its ``box.raw``.

    Raises
    ------
    subprocess.CalledProcessError
        If one of the template scripts exits with a non-zero status.
    IndexError
        If a ``data.*`` suffix is not a valid index into ``sys_configs``.

    Notes
    -----
    The working directory is changed while the function runs and is restored
    to its initial value on return, also when an exception is raised.
    """
    target_folder = os.path.abspath(target_folder)
    output = os.path.abspath(output)
    tool_path = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), "..", "template"
    )
    command_cvt_2_raw = os.path.join(tool_path, "tools.vasp", "convert2raw.py")
    command_cvt_2_raw += " data.configs"
    command_shuffle_raw = os.path.join(tool_path, "tools.raw", "shuffle_raw.py")
    command_raw_2_set = os.path.join(tool_path, "tools.raw", "raw_to_set.sh")

    cwd = os.getcwd()
    # goto input
    os.chdir(target_folder)
    try:
        with open(param_file) as fp:
            jdata = json.load(fp)
        sys_configs = jdata["sys_configs"]
        if verbose:
            # default=0 keeps an empty "sys_configs" list from raising.
            max_str_len = max((len(str(ii)) for ii in sys_configs), default=0)
            ptr_fmt = "%%%ds %%6d" % (max_str_len + 5)

        # collect systems from iter dirs
        coll_sys = [[] for _ in sys_configs]
        for iter_dir in sorted(glob.glob("iter.[0-9]*[0-9]")):
            iter_data = sorted(
                glob.glob(os.path.join(iter_dir, "02.fp", "data.[0-9]*[0-9]"))
            )
            for data_dir in iter_data:
                sys_idx = int(os.path.basename(data_dir).split(".")[-1])
                coll_sys[sys_idx].append(data_dir)

        # create output dir
        os.makedirs(output, exist_ok=True)

        # loop over systems
        for idx, data_dirs in enumerate(coll_sys):
            if not data_dirs:
                continue
            out_sys_path = os.path.join(output, "system.%03d" % idx)
            os.makedirs(out_sys_path, exist_ok=True)
            # All paths used below are either absolute or relative to the
            # system directory, so there is no need to change back between
            # systems; the initial directory is restored in ``finally``.
            os.chdir(out_sys_path)

            # link iter data dirs as "<iter dir>.<data dir>"
            for data_dir in data_dirs:
                in_sys_path = os.path.join(target_folder, data_dir)
                in_iter = os.path.basename(
                    os.path.dirname(os.path.dirname(in_sys_path))
                )
                in_base = os.path.basename(in_sys_path)
                _replace_symlink(in_sys_path, in_iter + "." + in_base)

            # cat data.configs
            data_configs = sorted(
                glob.glob(
                    os.path.join(
                        "iter.[0-9]*[0-9].data.[0-9]*[0-9]", "orig", "data.configs"
                    )
                )
            )
            os.makedirs("orig", exist_ok=True)
            with open(os.path.join("orig", "data.configs"), "w") as outfile:
                for fname in data_configs:
                    with open(fname) as infile:
                        outfile.write(infile.read())

            # convert to raw
            os.chdir("orig")
            sp.check_call(command_cvt_2_raw, shell=True)
            os.chdir("..")

            # shuffle raw
            sp.check_call(
                command_shuffle_raw + " orig " + " . > /dev/null", shell=True
            )
            _replace_symlink(os.path.join("orig", "type.raw"), "type.raw")

            # raw to sets
            sp.check_call(command_raw_2_set + " > /dev/null", shell=True)

            # print summary
            if verbose:
                ndata = file_len("box.raw")
                print(ptr_fmt % (str(sys_configs[idx]), ndata))
    finally:
        # Do not leak the directory change to the caller.
        os.chdir(cwd)
def _main():
    """Parse the command-line arguments and pass them to ``collect_data``."""
    cli = argparse.ArgumentParser(
        description="Collect data from DP-GEN iterations"
    )

    # Positional arguments, registered in the order they appear on the
    # command line.
    positionals = (
        ("JOB_DIR", "the directory of the DP-GEN job"),
        ("OUTPUT", "the output directory of data"),
    )
    for arg_name, help_text in positionals:
        cli.add_argument(arg_name, type=str, help=help_text)

    cli.add_argument(
        "-p",
        "--parameter",
        type=str,
        default="param.json",
        help="the json file provides DP-GEN paramters, should be located in JOB_DIR",
    )
    cli.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="print number of data in each system",
    )

    opts = cli.parse_args()
    collect_data(opts.JOB_DIR, opts.parameter, opts.OUTPUT, opts.verbose)


if __name__ == "__main__":
    _main()