#!/usr/bin/env python3
import argparse
import glob
import json
import os
import subprocess as sp
[docs]
def file_len(fname):
    """Return the number of lines in the text file ``fname``.

    Parameters
    ----------
    fname : str
        Path of the file to read. It is opened in text mode.

    Returns
    -------
    int
        Number of lines yielded when iterating over the file. An empty file
        gives 0; a final line without a trailing newline still counts.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    with open(fname) as f:
        # Counting with sum() avoids relying on a loop variable that is never
        # bound when the file has no lines.
        return sum(1 for _ in f)
[docs]
def collect_data(target_folder, param_file, output, verbose=True):
    """Collect the data of all iterations of a job into per-system folders.

    The function scans ``target_folder`` for ``iter.*/02.fp/data.*``
    directories, groups them by the integer suffix of ``data.*`` and, for
    every group that is not empty, builds ``<output>/system.NNN``:

    1. each data directory is symlinked as ``<iter>.<data>``;
    2. their ``orig/data.configs`` files are concatenated into
       ``orig/data.configs``;
    3. the bundled ``convert2raw.py``, ``shuffle_raw.py`` and
       ``raw_to_set.sh`` tools are run in that order;
    4. ``type.raw`` is (re)linked to ``orig/type.raw``.

    Parameters
    ----------
    target_folder : str
        Directory that contains the ``iter.*`` folders.
    param_file : str
        JSON file with a ``"sys_configs"`` entry. A relative path is resolved
        against ``target_folder``.
    output : str
        Directory in which the ``system.NNN`` folders are created. It is
        created if missing.
    verbose : bool, optional
        If true, print one line per collected system with its
        ``sys_configs`` entry and the number of lines of ``box.raw``.

    Raises
    ------
    KeyError
        If the parameter file has no ``"sys_configs"`` entry.
    IndexError
        If a ``data.*`` suffix is not a valid index into ``sys_configs``.
    subprocess.CalledProcessError
        If one of the external tools exits with a non-zero status.

    Notes
    -----
    The working directory is changed while the function runs and is restored
    to its initial value on return, including when an exception is raised.
    """
    target_folder = os.path.abspath(target_folder)
    output = os.path.abspath(output)
    tool_path = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), "..", "template"
    )
    # NOTE(review): these paths are interpolated into shell command strings
    # without quoting, so an installation path containing spaces or shell
    # metacharacters would break the calls below.
    command_cvt_2_raw = os.path.join(tool_path, "tools.vasp", "convert2raw.py")
    command_cvt_2_raw += " data.configs"
    command_shuffle_raw = os.path.join(tool_path, "tools.raw", "shuffle_raw.py")
    command_raw_2_set = os.path.join(tool_path, "tools.raw", "raw_to_set.sh")

    start_dir = os.getcwd()
    try:
        # goto input: the parameter file and the globs below are relative to it
        os.chdir(target_folder)
        with open(param_file) as fp:
            jdata = json.load(fp)
        sys_configs = jdata["sys_configs"]
        if verbose:
            # default=0 keeps an empty "sys_configs" list from raising
            max_str_len = max((len(str(cfg)) for cfg in sys_configs), default=0)
            ptr_fmt = "%%%ds %%6d" % (max_str_len + 5)

        # collect systems from iter dirs
        coll_sys = [[] for _ in sys_configs]
        for iter_dir in sorted(glob.glob("iter.[0-9]*[0-9]")):
            data_pattern = os.path.join(iter_dir, "02.fp", "data.[0-9]*[0-9]")
            for data_dir in sorted(glob.glob(data_pattern)):
                sys_idx = int(os.path.basename(data_dir).split(".")[-1])
                coll_sys[sys_idx].append(data_dir)

        # create output dir
        os.makedirs(output, exist_ok=True)

        # loop over systems
        for idx, data_dirs in enumerate(coll_sys):
            if not data_dirs:
                continue
            out_sys_path = os.path.join(output, "system.%03d" % idx)
            os.makedirs(out_sys_path, exist_ok=True)
            os.chdir(out_sys_path)

            # link iter data dirs
            for data_dir in data_dirs:
                in_sys_path = os.path.join(target_folder, data_dir)
                path_parts = in_sys_path.split("/")
                # <iter>/02.fp/<data>  ->  "<iter>.<data>"
                out_file = path_parts[-3] + "." + path_parts[-1]
                # lexists also sees dangling links left by an earlier run,
                # which would otherwise make os.symlink fail
                if os.path.lexists(out_file):
                    os.remove(out_file)
                os.symlink(in_sys_path, out_file)

            # cat data.configs
            data_configs = sorted(
                glob.glob(
                    os.path.join(
                        "iter.[0-9]*[0-9].data.[0-9]*[0-9]", "orig", "data.configs"
                    )
                )
            )
            os.makedirs("orig", exist_ok=True)
            with open(os.path.join("orig", "data.configs"), "w") as outfile:
                for fname in data_configs:
                    with open(fname) as infile:
                        outfile.write(infile.read())

            # convert to raw (run inside "orig")
            sp.check_call(command_cvt_2_raw, shell=True, cwd="orig")
            # shuffle raw
            sp.check_call(
                command_shuffle_raw + " orig " + " . > /dev/null", shell=True
            )
            if os.path.lexists("type.raw"):
                os.remove("type.raw")
            os.symlink(os.path.join("orig", "type.raw"), "type.raw")
            # raw to sets
            sp.check_call(command_raw_2_set + " > /dev/null", shell=True)

            # print summary
            if verbose:
                ndata = file_len("box.raw")
                print(ptr_fmt % (str(sys_configs[idx]), ndata))

            # back to the job directory before handling the next system
            os.chdir(target_folder)
    finally:
        # never leave the caller in a different working directory
        os.chdir(start_dir)
def _main():
    """Command-line entry point: parse the arguments and call ``collect_data``."""
    cli = argparse.ArgumentParser(
        description="Collect data from DP-GEN iterations"
    )

    # positional arguments
    cli.add_argument("JOB_DIR", type=str, help="the directory of the DP-GEN job")
    cli.add_argument("OUTPUT", type=str, help="the output directory of data")

    # optional arguments
    cli.add_argument(
        "-p",
        "--parameter",
        type=str,
        default="param.json",
        help="the json file provides DP-GEN paramters, should be located in JOB_DIR",
    )
    cli.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="print number of data in each system",
    )

    opts = cli.parse_args()
    collect_data(
        target_folder=opts.JOB_DIR,
        param_file=opts.parameter,
        output=opts.OUTPUT,
        verbose=opts.verbose,
    )
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    _main()