CLI: primitive structures, zip support, SNLs #101

Merged · 35 commits · Aug 27, 2019
Commits
c52e62c
Merge branch 'bugfix_20190517' into cli
tschaume May 18, 2019
b9f7d8f
Add option to control LDAU fields
May 29, 2019
fa99e19
Silly LDA calculations !
May 30, 2019
1257d9b
Actually define potcar object
Jun 4, 2019
a5cdb60
get functional in try/except for bad task docs
Jun 5, 2019
855a873
get kpt_labels for bad task-docs
Jun 5, 2019
83278f9
Fix error on no LDAU fields
Jun 5, 2019
a3845bc
find all encompassing chemsys as well
Jun 17, 2019
99d7217
fix bugs
Jun 17, 2019
9248315
Fix piezo tensor format
Jun 20, 2019
9f80f2d
fix bug
Jun 20, 2019
c7f4f83
use structure nsites
Jul 4, 2019
7229893
limit thermo query
Jul 5, 2019
821854b
deprecate incompatible mats
Jul 6, 2019
2a1c6b8
Merge branch 'cli' of github.com:tschaume/emmet into cli
tschaume Jul 15, 2019
fe207a4
Merge remote-tracking branch 'origin/master' into cli
tschaume Jul 15, 2019
fc877c5
Merge remote-tracking branch 'origin/updates' into cli
tschaume Jul 15, 2019
313ed86
cli: primitive structure
tschaume Jul 15, 2019
0812073
thermo docs in new mat not old
Jul 16, 2019
ace49c1
ensure piezo are lists
Jul 22, 2019
ddd5056
cli load: add zip support
tschaume Jul 22, 2019
2429c27
fix bug for getting warnings
Jul 26, 2019
5153812
cli: target_snls, make snls for parse
tschaume Aug 8, 2019
8a12a8b
cli.copy: fix sbxn
tschaume Aug 12, 2019
fc4bda8
pymatgen needs networkx 2.2
tschaume Aug 12, 2019
646fbfa
cli: submit restore
tschaume Aug 15, 2019
28e1a6b
cli: topdown and don't descend
tschaume Aug 16, 2019
fc58d0f
cli: cleaning up
tschaume Aug 21, 2019
ed3a5c9
cli garden_to_hpss cleanup
tschaume Aug 21, 2019
75856dd
cli.parse: update task_ids from launch dirs
tschaume Aug 21, 2019
7e04b3b
magnetism comes only from structure optimization
Aug 23, 2019
5a1daa8
default main run
Aug 26, 2019
5334511
Merge branch 'master' into updates
Aug 26, 2019
0baf7c5
cli: labels skip, bugfix
tschaume Aug 26, 2019
b08e6f6
Merge remote-tracking branch 'origin/updates' into cli
tschaume Aug 26, 2019
6 changes: 4 additions & 2 deletions emmet/materials/mp_website.py
@@ -337,7 +337,6 @@ def old_style_mat(new_style_mat):
string.ascii_uppercase[i]: float(vals[i]) for i in range(len(vals))
}
mat["initial_structure"] = new_style_mat.get("initial_structure", None)
mat["nsites"] = struc.get_primitive_structure().num_sites

set_(mat, "pseudo_potential.functional", "PBE")

@@ -405,7 +404,7 @@ def add_elastic(mat, new_style_mat):
else:
mat["elasticity"]["nsites"] = len(get(mat, "structure.sites"))

if get("elasticity.warnings", None) is None:
if get(new_style_mat,"elasticity.warnings") is None:
mat["elasticity"]["warnings"] = []


@@ -535,6 +534,9 @@ def add_thermo(mat, new_style_mat):
"""
Adds the thermo values in with sandboxing
"""
if "thermo_docs" not in new_style_mat:
mat["deprecated"] = True

if not mat["deprecated"]:
thermo = new_style_mat["thermo_docs"]

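Note on the add_elastic and add_thermo changes above: the old warnings check passed the dotted path as the object, so it never inspected the material document, and materials without thermo_docs are now flagged as deprecated. A minimal sketch of the corrected lookup, assuming pydash (which this module already uses for get/set_) and a hypothetical document:

```python
from pydash import get

new_style_mat = {"elasticity": {"K_VRH": 42.0}}  # hypothetical doc without warnings
mat = {"elasticity": {}}

# Old form: get("elasticity.warnings", None) used the path string as the object
# and None as the path, so the result never depended on new_style_mat.
# Corrected form from this PR: read the path out of the material document.
if get(new_style_mat, "elasticity.warnings") is None:
    mat["elasticity"]["warnings"] = []

print(mat)  # {'elasticity': {'warnings': []}}
```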
28 changes: 21 additions & 7 deletions emmet/materials/thermo.py
@@ -53,24 +53,38 @@ def get_items(self):

self.logger.info("Setting indexes")
self.ensure_indicies()

# All relevant materials that have been updated since thermo props were
# last calculated
q = dict(self.query)
q.update(self.materials.lu_filter(self.thermo))
updated_comps = set(self.materials.distinct("chemsys", q))
self.logger.debug("Found {} updated chemsys".format(len(updated_comps)))

# All materials that are not present in the thermo collection
thermo_mat_ids = self.thermo.distinct("task_id")
thermo_mat_ids = self.thermo.distinct(self.thermo.key)
mat_ids = self.materials.distinct(self.materials.key,self.query)
dif_task_ids = list(set(mat_ids) - set(thermo_mat_ids))
q = dict(self.query)
q.update({"task_id": {"$nin": thermo_mat_ids}})
q.update({"task_id": {"$in": dif_task_ids}})
new_mat_comps = set(self.materials.distinct("chemsys", q))

# All chemsys not present in thermo collection
new_comps = set(self.materials.distinct("chemsys", self.query)) - set(
self.thermo.distinct("chemsys")
self.logger.debug("Found {} new materials".format(len(new_mat_comps)))

# All comps affected by changing these chemical systems
# IE if we update Li-O, we need to update Li-Mn-O, Li-Mn-P-O, etc.
affected_comps = set()
for comp in updated_comps | new_mat_comps:
els = comp.split("-")
affected_comps |= set(
self.materials.distinct("chemsys", {"elements": {"$all": els}})
)
self.logger.debug(
"Found {} chemical systems affected by this build".format(
len(affected_comps)
)
)

comps = updated_comps | new_comps | new_mat_comps
comps = updated_comps | new_mat_comps | affected_comps

# Only process maximal super sets: e.g. if ["A","B"] and ["A"]
# are both in the list, will only yield ["A","B"] as this will
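The new affected_comps block above rebuilds every chemical system that contains an updated one, using an {'elements': {'$all': els}} query against the materials store. A standalone plain-Python sketch of the same superset expansion, with hypothetical data instead of MongoDB:

```python
def affected_chemsys(updated_comps, all_chemsys):
    """Chemical systems whose element set is a superset of any updated system."""
    affected = set()
    for comp in updated_comps:
        els = set(comp.split("-"))
        # same idea as the {'elements': {'$all': els}} query against the store
        affected |= {c for c in all_chemsys if els <= set(c.split("-"))}
    return affected

all_chemsys = {"Li-O", "Li-Mn-O", "Li-Mn-O-P", "Fe-O", "Cl-Na"}
print(affected_chemsys({"Li-O"}, all_chemsys))
# {'Li-O', 'Li-Mn-O', 'Li-Mn-O-P'}
```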
96 changes: 59 additions & 37 deletions emmet/scripts/emmet.py
@@ -1,4 +1,4 @@
import click, os, yaml, sys, logging, tarfile, bson, gzip, csv, tarfile, itertools, multiprocessing, math, io, requests
import click, os, yaml, sys, logging, tarfile, bson, gzip, csv, tarfile, itertools, multiprocessing, math, io, requests, json
from shutil import copyfile, rmtree
from glob import glob
from fnmatch import fnmatch
@@ -29,6 +29,7 @@
from tqdm import tqdm
from pprint import pprint
from mongogrant.client import Client
from zipfile import ZipFile

def get_lpad():
if 'FW_CONFIG_FILE' not in os.environ:
@@ -37,7 +38,7 @@ def get_lpad():
return LaunchPad.auto_load()

exclude = {'about.remarks': {'$nin': ['DEPRECATED', 'deprecated']}}
skip_labels = ['He', 'He0+', 'Ar', 'Ar0+', 'Ne', 'Ne0+', 'D', 'D+']
skip_labels = ['He', 'He0+', 'Ar', 'Ar0+', 'Ne', 'Ne0+', 'D', 'D+', 'T', 'M']
base_query = {'is_ordered': True, 'is_valid': True, 'nsites': {'$lt': 200}, 'sites.label': {'$nin': skip_labels}}
task_base_query = {'tags': {'$nin': ['DEPRECATED', 'deprecated']}, '_mpworks_meta': {'$exists': 0}}
structure_keys = ['snl_id', 'lattice', 'sites', 'charge', 'about._materialsproject.task_id']
@@ -150,13 +151,13 @@ def get_vasp_dirs(scan_path, base_path, max_dirs, insert):
counter = 0

# NOTE os.walk followlinks=False by default, as intended here
for root, dirs, files in os.walk(scan_path):
# TODO ignore relax1/2 subdirs if INCAR.orig found
for root, dirs, files in os.walk(scan_path, topdown=True):
if contains_vasp_dirs(files):
yield get_symlinked_path(root, base_path_index, insert)
counter += 1
if counter >= max_dirs:
break
dirs[:] = [] # don't descend further (i.e. ignore relax1/2)
else:
for f in files:
if f.endswith('.tar.gz'):
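The topdown=True walk plus the in-place dirs[:] = [] above stops the scan from descending into a directory once it is recognised as a VASP run, so relax1/relax2 subfolders are no longer picked up as separate tasks. A self-contained sketch of the pattern (the vasprun.xml marker is an assumption here, not the actual contains_vasp_dirs check):

```python
import os

def walk_vasp_dirs(scan_path):
    """Yield directories that look like VASP runs, without descending into them."""
    for root, dirs, files in os.walk(scan_path, topdown=True):
        if any(f.startswith("vasprun.xml") for f in files):  # assumed run marker
            yield root
            dirs[:] = []  # prune: with topdown=True, os.walk skips subdirs (relax1/relax2)

# for d in walk_vasp_dirs("/path/to/garden/block_2019-01-01"):  # hypothetical path
#     print(d)
```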
@@ -178,6 +179,7 @@ def parse_vasp_dirs(vaspdirs, insert, drone, already_inserted_subdirs):
lpad = get_lpad()
target = calcdb_from_mgrant(f'{lpad.host}/{lpad.name}')
print(name, 'connected to target db with', target.collection.count(), 'tasks')
input_structures = []

for vaspdir in vaspdirs:
if get_subdir(vaspdir) in already_inserted_subdirs:
@@ -207,6 +209,9 @@ def parse_vasp_dirs(vaspdirs, insert, drone, already_inserted_subdirs):
print(name, 'removed', vaspdir)
continue

s = Structure.from_dict(task_doc['input']['structure'])
input_structures.append(s)

q = {'dir_name': {'$regex': get_subdir(vaspdir)}}
# check completed_at timestamp to decide on re-parse (only relevant for --force)
docs = list(target.collection.find(q, {'completed_at': 1}).sort([('_id', -1)]).limit(1))
@@ -233,9 +238,8 @@ def parse_vasp_dirs(vaspdirs, insert, drone, already_inserted_subdirs):
task_doc['calcs_reversed'][0]['output'].pop('force_constants')
target.insert_task(task_doc, use_gridfs=True)

nr_vaspdirs = len(vaspdirs)
print(name, 'processed', nr_vaspdirs, 'VASP directories')
return nr_vaspdirs
print(name, 'processed', len(vaspdirs), 'VASP directories -', len(input_structures), 'structures')
return input_structures

@click.group()
def cli():
@@ -298,6 +302,8 @@ def copy(target_spec, tag, insert, copy_snls, sbxn, src, force):
target = calcdb_from_mgrant(target_spec)
print('connected to target db with', target.collection.count(), 'tasks')

sbxn = list(sbxn) if sbxn else target.collection.distinct('sbxn')

ensure_indexes(['task_id', 'tags', 'dir_name', 'retired_task_id'], [source.collection, target.collection])

tags = [tag]
@@ -326,11 +332,13 @@ def copy(target_spec, tag, insert, copy_snls, sbxn, src, force):
if counter:
print(counter, 'year tags fixed.')

target_snls = target.db.snls_user

def insert_snls(snls_list):
if snls_list:
print('copy', len(snls_list), 'SNLs')
if insert:
result = target.db.snls.insert_many(snls_list)
result = target_snls.insert_many(snls_list)
print('#SNLs inserted:', len(result.inserted_ids))
snls_list.clear()
else:
@@ -363,7 +371,7 @@ def insert_snls(snls_list):
snl_copied = False
try:
q = {'about.projects': t, '$or': [{k: formula} for k in aggregation_keys]}
group = aggregate_by_formula(target.db.snls, q).next() # only one formula
group = aggregate_by_formula(target_snls, q).next() # only one formula
for dct in group['structures']:
existing_structure = Structure.from_dict(dct)
if structures_match(snl.structure, existing_structure):
@@ -376,7 +384,7 @@ def insert_snls(snls_list):
continue
snl_dct = snl.as_dict()
if index is None:
index = max([int(snl_id[len(prefix)+1:]) for snl_id in target.db.snls.distinct('snl_id')]) + 1
index = max([int(snl_id[len(prefix)+1:]) for snl_id in target_snls.distinct('snl_id')]) + 1
else:
index += 1
snl_id = '{}-{}'.format(prefix, index)
@@ -495,8 +503,7 @@ def insert_snls(snls_list):
print('ERROR: not a SO task!')
continue

if sbxn:
task_doc['sbxn'] = list(sbxn)
task_doc['sbxn'] = sbxn

if insert:
target.insert_task(task_doc, use_gridfs=True)
@@ -809,7 +816,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name):
continue
mongo_handler.collection.remove(q) # avoid dups
counter['structures'] += 1
s = Structure.from_dict(dct)
s = Structure.from_dict(dct).get_primitive_structure()
s.snl_id = dct['snl_id']
s.task_id = dct.get('task_id')
try:
@@ -1024,7 +1031,7 @@ def find_matching_canonical_task_structures(formula, struct, full_name):
print(len(canonical_structures_list), 'canonical structure(s) for', formula, sites_elements)
if tag is not None:
print('trying again ...')
wflows(add_snlcolls, add_tasks_db, tag, insert, clear_logs, max_structures, True)
wflows(add_snlcolls, add_taskdbs, tag, insert, clear_logs, max_structures, True, force_new)

print(counter)

@@ -1118,6 +1125,13 @@ def report(tag, in_progress, to_csv):
for row in table._get_rows(options):
writer.writerow(row)

def get_format(fname):
if fnmatch(fname, "*.cif*") or fnmatch(fname, "*.mcif*"):
return 'cif'
elif fnmatch(fname, "*.json*") or fnmatch(fname, "*.mson*"):
return 'json'
else:
raise ValueError('reading', fname, 'not supported (yet)')

@cli.command()
@click.argument('archive', type=click.Path(exists=True))
@@ -1134,7 +1148,7 @@ def load(archive, add_snlcolls, insert):
tag, sec_ext = fname.rsplit('.', 1) if '.' in fname else [fname, '']
if sec_ext:
ext = ''.join([sec_ext, ext])
exts = ['tar.gz', '.tgz', 'bson.gz']
exts = ['tar.gz', '.tgz', 'bson.gz', '.zip']
if ext not in exts:
print(ext, 'not supported (yet)! Please use one of', exts)
return
@@ -1148,6 +1162,16 @@ def load(archive, add_snlcolls, insert):
if any([bool(l in elements) for l in skip_labels]):
continue
input_structures.append(TransformedStructure.from_dict(doc['structure']))
elif ext == '.zip':
input_zip = ZipFile(archive)
for fname in input_zip.namelist():
contents = input_zip.read(fname)
fmt = get_format(fname)
try:
input_structures.append(Structure.from_str(contents, fmt=fmt))
except Exception as ex:
print(ex)
break #continue
else:
tar = tarfile.open(archive, 'r:gz')
for member in tar.getmembers():
@@ -1157,13 +1181,7 @@ def load(archive, add_snlcolls, insert):
if f:
contents = f.read().decode('utf-8')
fname = member.name.lower()
if fnmatch(fname, "*.cif*") or fnmatch(fname, "*.mcif*"):
fmt = 'cif'
elif fnmatch(fname, "*.json*") or fnmatch(fname, "*.mson*"):
fmt = 'json'
else:
print('reading', fname, 'not supported (yet)')
continue
fmt = get_format(fname)
try:
input_structures.append(Structure.from_str(contents, fmt=fmt))
except Exception as ex:
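The new .zip branch above reads each member of the archive and parses it into a pymatgen Structure, choosing the format the same way get_format() does. A self-contained sketch of that flow (the helper name and the explicit decode are illustrative additions, and the pymatgen import path may differ by version):

```python
from fnmatch import fnmatch
from zipfile import ZipFile

from pymatgen.core import Structure


def structures_from_zip(path):
    """Parse every CIF/JSON member of a zip archive into pymatgen Structures."""
    structures = []
    with ZipFile(path) as archive:
        for name in archive.namelist():
            lowered = name.lower()
            if fnmatch(lowered, "*.cif*") or fnmatch(lowered, "*.mcif*"):
                fmt = "cif"
            elif fnmatch(lowered, "*.json*") or fnmatch(lowered, "*.mson*"):
                fmt = "json"
            else:
                continue  # unsupported member
            contents = archive.read(name).decode("utf-8")  # read() returns bytes
            try:
                structures.append(Structure.from_str(contents, fmt=fmt))
            except Exception as exc:
                print(name, exc)
    return structures
```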
@@ -1301,9 +1319,9 @@ def insert_snls(snls_list):
@click.option('--nproc', '-n', type=int, default=1, help='number of processes for parallel parsing')
@click.option('--max-dirs', '-m', type=int, default=10, help='maximum number of directories to parse')
@click.option('--force/--no-force', default=False, help='force re-parsing of task')
#@click.option('--add_snlcolls', '-a', type=click.Path(exists=True), help='YAML config file with multiple documents defining additional SNLs collections to scan')
#@click.option('--make-snls/--no-make-snls', default=False, help='also create SNLs for parsed tasks')
def parse(base_path, insert, nproc, max_dirs, force):#, add_snlcolls, make_snls):
@click.option('--add_snlcolls', '-a', type=click.Path(exists=True), help='YAML config file with multiple documents defining additional SNLs collections to scan')
@click.option('--make-snls/--no-make-snls', default=False, help='also create SNLs for parsed tasks')
def parse(base_path, insert, nproc, max_dirs, force, add_snlcolls, make_snls):
"""parse VASP output directories in base_path into tasks and tag"""
if not insert:
print('DRY RUN: add --insert flag to actually insert tasks')
@@ -1330,7 +1348,7 @@ def parse(base_path, insert, nproc, max_dirs, force):#, add_snlcolls, make_snls)
iterator_vaspdirs = get_vasp_dirs(base_path, base_path, max_dirs, insert)
iterator = iterator_slice(iterator_vaspdirs, chunk_size) # process in chunks
queue = deque()
total_nr_vaspdirs_parsed = 0
input_structures = []

while iterator or queue:
try:
@@ -1344,19 +1362,23 @@ def parse(base_path, insert, nproc, max_dirs, force):#, add_snlcolls, make_snls)
if not process.ready():
queue.append(process)
else:
total_nr_vaspdirs_parsed += process.get()
input_structures += process.get()

pool.close()
print('DONE:', total_nr_vaspdirs_parsed, 'parsed')

#input_structures = []
# if make_snls:
# s = Structure.from_dict(task_doc['input']['structure'])
# input_structures.append(s)

#if insert and make_snls:
# print('add SNLs for', len(input_structures), 'structures')
# add_snls(tag, input_structures, add_snlcolls, insert)
print('DONE:', len(input_structures), 'structures')

fn = os.path.join(base_path, 'launchdir_to_taskid.json')
if os.path.exists(fn):
print('updating task ids...')
with open(fn, 'r') as f:
launchdir_to_taskid = json.load(f)
for doc in target.collection.find({'tags': tag}, {'dir_name': 1, 'task_id': 1, '_id': 0}):
task_id = launchdir_to_taskid[get_subdir(doc['dir_name'])]
target.collection.update({'task_id': doc['task_id']}, {'$set': {'task_id': task_id}})

if insert and make_snls:
print('add SNLs for', len(input_structures), 'structures')
add_snls(tag, input_structures, add_snlcolls, insert)

def upload_archive(path, name, service, parent=None):
media = MediaFileUpload(path, mimetype='application/gzip', resumable=True)
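At the end of the reworked parse flow above, task ids are reconciled against a launchdir_to_taskid.json file in base_path. A sketch of that step, assuming the mapping is simply {launch subdirectory: canonical task_id} and using pymongo's update_one in place of the older update call:

```python
import json
import os


def update_task_ids(collection, base_path, tag, get_subdir):
    """Rename task_ids of freshly parsed tasks using a launchdir -> task_id map."""
    fn = os.path.join(base_path, "launchdir_to_taskid.json")
    if not os.path.exists(fn):
        return
    with open(fn) as f:
        # assumed shape: {"block_2019-.../launcher_...": "mp-123456", ...}
        launchdir_to_taskid = json.load(f)
    for doc in collection.find({"tags": tag}, {"dir_name": 1, "task_id": 1, "_id": 0}):
        new_id = launchdir_to_taskid[get_subdir(doc["dir_name"])]
        collection.update_one({"task_id": doc["task_id"]}, {"$set": {"task_id": new_id}})
```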
47 changes: 38 additions & 9 deletions emmet/scripts/garden_to_hpss.sh
@@ -1,15 +1,44 @@
#!/bin/bash

cd $1 && pwd
if [ "$#" -ne 2 ]; then
echo "Usage: $0 DIRECTORY FILTER"
exit 1
fi

for block_dir in `find $1 -maxdepth 1 -type d -name "block_*"`; do
echo $block_dir
parallel -0m 'chmod -v g+rw {}' :::: <(find $block_dir -not -perm -660 -print0)
indir=$1
filter=$2
cd $indir && pwd

for block in $(find . -maxdepth 1 -type d -name "$filter" -exec basename {} \;); do
echo $block
[[ ! -d $block ]] && echo $block does not exist && exit
find $block -type d -empty -print -delete
[[ ! -d $block ]] && echo $block only contained empty directories && exit

parallel -0m 'chmod -v g+rw {}' :::: <(find $block -not -perm -660 -print0)
[[ $? -ne 0 ]] && echo 'error in chmod' && exit
find $block_dir -type f -not -name "*.gz" -exec pigz -9v {} \;
find $block -type f -not -name "*.gz" -exec pigz -9v {} \;
[[ $? -ne 0 ]] && echo "error in pigz" && exit
block=`basename $block_dir`
htar -M 5000000 -cvf garden/${block}.tar $block
[[ $? -ne 0 ]] && echo "error with htar" && exit
rm -rfv $block_dir

htar -vtf garden/${block}.tar | awk '{ print $7 }' | sort -u > ${block}.tar.idx
[[ $? -ne 0 ]] && echo "error in htar -t" && exit # TODO upload new archive if not exists
find $block -type f | sort -u > ${block}.idx

comm -13 ${block}.tar.idx ${block}.idx > ${block}.missing
if [ -s ${block}.missing ]; then
nfiles=$(wc -l ${block}.missing | awk '{ print $1}')
echo need syncing of $nfiles files
htar -xvf garden/${block}.tar
[[ $? -ne 0 ]] && echo "error in htar -x" && exit
hsi -q -l matcomp mv garden/${block}.tar garden/${block}.tar.bkp
hsi -q -l matcomp mv garden/${block}.tar.idx garden/${block}.tar.idx.bkp
htar -M 5000000 -cvf garden/${block}.tar ${block}
[[ $? -ne 0 ]] && echo "error in htar -c" && exit
hsi -q -l matcomp rm garden/${block}.tar*.bkp
[[ $? -ne 0 ]] && echo 'error in htar rm' && exit
else
echo all files already in HTAR archive
fi
rm -rv ${block}
rm -v ${block}.tar.idx ${block}.idx ${block}.missing
done
@@ -7,8 +7,10 @@
#SBATCH --mail-type=ALL
#SBATCH --output=garden_to_hpss-%j.out
#SBATCH --error=garden_to_hpss-%j.error
#SBATCH --mem=10GB

script=$HOME/mp_prod/codes/emmet/emmet/scripts/garden_to_hpss.sh
indir=/project/projectdirs/matgen/garden/hpss_to_mpdrive/raw/
$script $indir
#indir=/project/projectdirs/matgen/garden/hpss_to_mpdrive/raw
indir=/project/projectdirs/matgen/garden/MatProj
filter="block_2011-*"

$script $indir $filter