Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ASE ingester and generalize other ingestion utilities #1509

Merged
merged 6 commits into from
Feb 14, 2023
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,7 @@
<img width="100px" align="center" src="https://matsci.org/uploads/default/original/2X/b/bd2f59b3bf14fb046b74538750699d7da4c19ac1.svg">
</div>

<h1 align="center">
OPTIMADE Python tools
</h1>

# <div align="center">OPTIMADE Python tools</div>

<div align="center">

Expand Down Expand Up @@ -50,6 +47,7 @@ This is to enable interoperability among databases that serve crystal structures
This repository contains a library of tools for implementing and consuming [OPTIMADE APIs](https://www.optimade.org) using Python:

1. [pydantic](https://github.com/pydantic/pydantic) data models for all [OPTIMADE entry types](https://www.optimade.org/optimade-python-tools/latest/all_models/) and endpoint responses, and a [Lark](https://github.com/lark-parser/lark) [EBNF grammar](https://en.wikipedia.org/wiki/Extended_Backus%E2%80%93Naur_form) implementation for the OPTIMADE filter language.
1. Adapters to map OPTIMADE data to and from many commonly used atomistic Python frameworks (e.g., [pymatgen](https://pymatgen.org/), [ASE](https://wiki.fysik.dtu.dk/ase/)) and crystallographic file types (e.g., [CIF](https://www.iucr.org/resources/cif)), using the `optimade.adapters` module.
1. A configurable reference server implementation that can make use of either MongoDB or Elasticsearch database backends out-of-the-box, and is readily extensible to other backends. Try it out on the [demo site](https://optimade.fly.dev)! The OpenAPI schemas of the server are used to construct the [OPTIMADE schemas](https://schemas.optimade.org/) site.
1. An [OPTIMADE client](https://www.optimade.org/optimade-python-tools/latest/getting_started/client/) (`optimade-get`) that can query multiple [OPTIMADE providers](https://optimade.org/providers-dashboard) concurrently with a given filter, at the command-line or from Python code.
1. A fuzzy API validator tool, which may be called from the shell (`optimade-validator`) or used as a GitHub Action from [optimade-validator-action](https://github.com/Materials-Consortia/optimade-validator-action); this validator is used to construct the [providers dashboard](https://optimade.org/providers-dashboard).
Expand Down
46 changes: 46 additions & 0 deletions optimade/adapters/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@ class EntryAdapter:
Attributes:
ENTRY_RESOURCE: Entry resource to store entry as.
_type_converters: Dictionary of valid conversion types for entry.
_type_ingesters: Dictionary of valid ingestion types mapped to ingestion functions.
_type_ingesters_by_type: Dictionary mapping the keys of `_type_ingesters` to data
types that can be ingested.
as_<_type_converters>: Convert entry to a type listed in `_type_converters`.
from_<_type_converters>: Convert an external type to the corresponding OPTIMADE model.

Expand All @@ -42,6 +45,7 @@ class EntryAdapter:
ENTRY_RESOURCE: Type[EntryResource] = EntryResource
_type_converters: Dict[str, Callable] = {}
_type_ingesters: Dict[str, Callable] = {}
_type_ingesters_by_type: Dict[str, Type] = {}

def __init__(self, entry: dict) -> None:
"""
Expand Down Expand Up @@ -116,6 +120,48 @@ def convert(self, format: str) -> Any:

return self._converted[format]

@classmethod
def ingest_from(cls, data: Any, format: Optional[str] = None) -> Any:
    """Ingest external data and wrap it in this adapter class.

    Parameters:
        data (Any): The external object to ingest.
        format (str): Key of `_type_ingesters` naming the format that `data` is in.
            If `None`, the format is inferred by checking `data` against the types
            registered in `_type_ingesters_by_type`; the first match wins.

    Raises:
        AttributeError: If `format` is `None` and no registered type matches `data`,
            or if `format` can not be found in `_type_ingesters`.

    Returns:
        An instance of this class built from the ingested attributes.

    """
    if format is None:
        # Infer the format from the type of `data` (first registered match wins).
        format = next(
            (
                key
                for key, instance_type in cls._type_ingesters_by_type.items()
                if isinstance(data, instance_type)
            ),
            None,
        )
        if format is None:
            raise AttributeError(
                f"Could not infer an ingestion format for data of type {type(data)}.\n"
                f"Valid entry types: {tuple(cls._type_ingesters.keys())}"
            )

    if format not in cls._type_ingesters:
        raise AttributeError(
            f"Non-valid entry type to ingest from: {format}\n"
            f"Valid entry types: {tuple(cls._type_ingesters.keys())}"
        )

    # The ingester returns a model whose `.dict()` gives the raw attributes.
    # NOTE(review): "type" is hard-coded to "structures" although this method
    # lives on the generic adapter base class -- confirm before adding
    # ingesters for other entry types.
    return cls(
        {
            "attributes": cls._type_ingesters[format](data).dict(),
            "id": "",
            "type": "structures",
        }
    )

@staticmethod
def _get_model_attributes(
starting_instances: Union[Tuple[BaseModel, ...], List[BaseModel]], name: str
Expand Down
10 changes: 9 additions & 1 deletion optimade/adapters/structures/adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@
from optimade.models import StructureResource

from .aiida import get_aiida_structure_data
from .ase import get_ase_atoms
from .ase import Atoms as ASEAtoms
from .ase import from_ase_atoms, get_ase_atoms
from .cif import get_cif
from .jarvis import get_jarvis_atoms
from .proteindatabank import get_pdb, get_pdbx_mmcif
from .pymatgen import Structure as PymatgenStructure
from .pymatgen import from_pymatgen, get_pymatgen


Expand Down Expand Up @@ -55,4 +57,10 @@ class Structure(EntryAdapter):

_type_ingesters: Dict[str, Callable] = {
"pymatgen": from_pymatgen,
"ase": from_ase_atoms,
}

_type_ingesters_by_type: Dict[str, Type] = {
"pymatgen": PymatgenStructure,
"ase": ASEAtoms,
JPBergsma marked this conversation as resolved.
Show resolved Hide resolved
}
58 changes: 56 additions & 2 deletions optimade/adapters/structures/ase.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,15 @@
from typing import Dict

from optimade.adapters.exceptions import ConversionError
from optimade.adapters.structures.utils import species_from_species_at_sites
from optimade.adapters.structures.utils import (
elements_ratios_from_species_at_sites,
species_from_species_at_sites,
)
from optimade.models import Species as OptimadeStructureSpecies
from optimade.models import StructureFeatures
from optimade.models import StructureResource as OptimadeStructure
from optimade.models.structures import StructureResourceAttributes
from optimade.models.utils import anonymize_formula, reduce_formula

try:
from ase import Atom, Atoms
Expand All @@ -26,7 +31,7 @@
ASE_NOT_FOUND = "ASE not found, cannot convert structure to an ASE Atoms"


__all__ = ("get_ase_atoms",)
__all__ = ("get_ase_atoms", "from_ase_atoms")


def get_ase_atoms(optimade_structure: OptimadeStructure) -> Atoms:
Expand Down Expand Up @@ -85,3 +90,52 @@ def get_ase_atoms(optimade_structure: OptimadeStructure) -> Atoms:
return Atoms(
symbols=atoms, cell=attributes.lattice_vectors, pbc=attributes.dimension_types
)


def from_ase_atoms(atoms: Atoms) -> StructureResourceAttributes:
    """Build an OPTIMADE `StructureResourceAttributes` model from an ASE `Atoms` object.

    Every entry of `atoms.info` is carried over as an extra attribute whose name is
    the lower-cased key prefixed with `_ase_`.

    Parameters:
        atoms: The ASE `Atoms` object to convert.

    Raises:
        RuntimeError: If `atoms` is not an instance of `Atoms`.

    Returns:
        An OPTIMADE `StructureResourceAttributes` model, which can be converted to a raw Python
        dictionary with `.dict()` or to JSON with `.json()`.

    """
    if not isinstance(atoms, Atoms):
        raise RuntimeError(
            f"Cannot convert type {type(atoms)} into an OPTIMADE `StructureResourceAttributes` model."
        )

    symbols = atoms.get_chemical_symbols()
    species = species_from_species_at_sites(symbols)
    # One 0/1 flag per entry of `atoms.pbc`.
    periodicity = [int(flag) for flag in atoms.pbc.tolist()]
    full_formula = atoms.get_chemical_formula()
    reduced_formula = reduce_formula(full_formula)

    attributes = {
        "cartesian_site_positions": atoms.positions.tolist(),
        "lattice_vectors": atoms.cell.tolist(),
        "species_at_sites": symbols,
        "elements_ratios": elements_ratios_from_species_at_sites(symbols),
        "species": species,
        "dimension_types": periodicity,
        "nperiodic_dimensions": sum(periodicity),
        "nelements": len(species),
        "elements": sorted(entry.name for entry in species),
        "nsites": len(symbols),
        "chemical_formula_descriptive": full_formula,
        "chemical_formula_reduced": reduced_formula,
        "chemical_formula_anonymous": anonymize_formula(reduced_formula),
        "last_modified": None,
        "immutable_id": None,
        "structure_features": [],
    }

    # Carry over the free-form ASE metadata under `_ase_`-prefixed, lower-cased keys.
    attributes.update(
        {f"_ase_{key}".lower(): value for key, value in atoms.info.items()}
    )

    return StructureResourceAttributes(**attributes)
41 changes: 6 additions & 35 deletions optimade/adapters/structures/pymatgen.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,10 @@
from optimade.models import Species as OptimadeStructureSpecies
from optimade.models import StructureResource as OptimadeStructure
from optimade.models import StructureResourceAttributes
from optimade.models.utils import anonymize_formula, reduce_formula

try:
from pymatgen.core import Composition, Lattice, Molecule, Structure
from pymatgen.core import Lattice, Molecule, Structure

except (ImportError, ModuleNotFoundError):
from warnings import warn
Expand Down Expand Up @@ -168,14 +169,14 @@ def from_pymatgen(pmg_structure: Structure) -> StructureResourceAttributes:
attributes["dimension_types"] = [int(_) for _ in pmg_structure.lattice.pbc]
attributes["nperiodic_dimensions"] = sum(attributes["dimension_types"])
attributes["nelements"] = len(pmg_structure.composition.elements)
attributes["chemical_formula_anonymous"] = _pymatgen_anonymized_formula_to_optimade(
pmg_structure.composition
attributes["chemical_formula_anonymous"] = anonymize_formula(
pmg_structure.composition.formula
)
attributes["elements"] = sorted(
[_.symbol for _ in pmg_structure.composition.elements]
)
attributes["chemical_formula_reduced"] = _pymatgen_reduced_formula_to_optimade(
pmg_structure.composition
attributes["chemical_formula_reduced"] = reduce_formula(
pmg_structure.composition.formula
)
attributes["chemical_formula_descriptive"] = pmg_structure.composition.formula
attributes["elements_ratios"] = [
Expand All @@ -188,33 +189,3 @@ def from_pymatgen(pmg_structure: Structure) -> StructureResourceAttributes:
attributes["structure_features"] = []

return StructureResourceAttributes(**attributes)


def _pymatgen_anonymized_formula_to_optimade(composition: Composition) -> str:
    """Build an OPTIMADE `chemical_formula_anonymous` string from a pymatgen `Composition`.

    The amounts found in `composition.anonymized_formula` are re-labelled, in
    reverse order, with the labels produced by `anonymous_element_generator`.

    NOTE(review): the reversal presumably turns pymatgen's ordering into the
    largest-proportion-first ordering used by OPTIMADE -- confirm against pymatgen.

    """
    import re

    from optimade.models.utils import anonymous_element_generator

    # Splitting on the capital letters leaves the text before the first label
    # followed by one amount string per label; the leading piece is discarded.
    amounts = re.split("[A-Z]", composition.anonymized_formula)[1:]
    amounts.reverse()

    return "".join(
        label + amount
        for label, amount in zip(anonymous_element_generator(), amounts)
    )


def _pymatgen_reduced_formula_to_optimade(composition: Composition) -> str:
    """Build an OPTIMADE `chemical_formula_reduced` string from a pymatgen `Composition`.

    Element symbols are emitted in alphabetical order, each followed by its amount
    divided by the greatest common divisor of all amounts; amounts that are not
    greater than one after the division are omitted.

    """
    import numpy

    reduced_amounts = composition.to_reduced_dict
    divisor = numpy.gcd.reduce([int(amount) for amount in reduced_amounts.values()])

    pieces = []
    for symbol in sorted(element.symbol for element in composition.elements):
        amount = reduced_amounts[symbol]
        # The comparison uses the raw amount, the printed value its integer part.
        suffix = int(amount) // divisor if amount // divisor > 1 else ""
        pieces.append(f"{symbol}{suffix}")

    return "".join(pieces)
11 changes: 11 additions & 0 deletions optimade/adapters/structures/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,3 +355,14 @@ def species_from_species_at_sites(
OptimadeStructureSpecies(name=_, concentration=[1.0], chemical_symbols=[_])
for _ in set(species_at_sites)
]


def elements_ratios_from_species_at_sites(species_at_sites: List[str]) -> List[float]:
    """Compute the OPTIMADE `elements_ratios` field from `species_at_sites` in the case where `species_at_sites` refers
    to sites wholly occupied by the given elements, e.g., not arbitrary species labels or with partial/mixed occupancy.

    Parameters:
        species_at_sites: The species label of every site, one entry per site.

    Returns:
        The fraction of sites occupied by each distinct label, ordered by the
        alphabetically sorted labels. An empty input gives an empty list.

    """
    from collections import Counter

    # Tally every label in a single pass rather than rescanning the list per label.
    counts = Counter(species_at_sites)
    num_sites = len(species_at_sites)
    return [counts[element] / num_sites for element in sorted(counts)]
18 changes: 4 additions & 14 deletions optimade/models/structures.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,7 @@
# pylint: disable=no-self-argument,line-too-long,no-name-in-module
import math
import re
import sys
import warnings
from enum import Enum, IntEnum
from functools import reduce
from typing import List, Optional, Union

from pydantic import BaseModel, conlist, root_validator, validator
Expand All @@ -18,6 +15,7 @@
OptimadeField,
StrictField,
SupportLevel,
reduce_formula,
)
from optimade.warnings import MissingExpectedField

Expand Down Expand Up @@ -895,18 +893,10 @@ def check_reduced_formulae(cls, value, field):
if value is None:
return value

numbers = [n.strip() or 1 for n in re.split(r"[A-Z][a-z]*", value)]
# Need to remove leading 1 from split and convert to ints
numbers = [int(n) for n in numbers[1:]]

if sys.version_info[1] >= 9:
gcd = math.gcd(*numbers)
else:
gcd = reduce(math.gcd, numbers)

if gcd != 1:
reduced_formula = reduce_formula(value)
if reduced_formula != value:
raise ValueError(
f"{field.name} {value!r} is not properly reduced: greatest common divisor was {gcd}, expected 1."
f"{field.name} {value!r} is not properly reduced: expected {reduced_formula!r}."
)

return value
Expand Down
61 changes: 60 additions & 1 deletion optimade/models/utils.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import inspect
import itertools
import math
import re
import warnings
from enum import Enum
from typing import TYPE_CHECKING, Optional
from functools import reduce
from typing import TYPE_CHECKING, List, Optional

from pydantic import Field
from pydantic.fields import FieldInfo
Expand Down Expand Up @@ -228,6 +230,63 @@ def anonymous_element_generator():
yield "".join(s)


def _reduce_or_anonymize_formula(
    formula: str, alphabetize: bool = True, anonymize: bool = False
) -> str:
    """Takes an input formula, reduces it and either alphabetizes or anonymizes it.

    Parameters:
        formula: A formula made of `[A-Z][a-z]*` element symbols, each optionally
            followed by an integer count (whitespace around counts is ignored).
        alphabetize: Sort the element symbols alphabetically. Ignored when
            `anonymize` is set.
        anonymize: Replace the element symbols with the labels produced by
            `anonymous_element_generator`, ordered from largest count to smallest.

    Raises:
        ValueError: If there is text before the first element symbol, if a count
            is not a positive integer, or if an empty formula is to be alphabetized.

    Returns:
        The formula with all counts divided by their greatest common divisor and
        counts equal to 1 omitted.

    """
    element_pattern = r"[A-Z][a-z]*"
    error_message = f"Something is wrong with the input formula: {formula}"

    species = re.findall(element_pattern, formula)

    # Splitting on the element symbols leaves the text before the first symbol
    # followed by exactly one (possibly empty) count string per symbol.
    leading_text, *count_strings = re.split(element_pattern, formula)

    # Anything but whitespace before the first symbol would otherwise be
    # silently dropped (e.g. "2H2O" would be read as "H2O").
    if leading_text.strip():
        raise ValueError(error_message)

    try:
        # A missing count means a count of one.
        numbers: List[int] = [int(n.strip() or 1) for n in count_strings]
    except ValueError as exc:
        raise ValueError(error_message) from exc

    # Zero or negative counts are meaningless and would break the reduction below.
    if any(n < 1 for n in numbers):
        raise ValueError(error_message)

    # Seeding with 0 (gcd(0, n) == n) behaves the same on every Python version
    # and also copes with an empty list of counts.
    gcd = reduce(math.gcd, numbers, 0)
    numbers = [n // gcd for n in numbers]

    if anonymize:
        numbers = sorted(numbers, reverse=True)
        species = [s for _, s in zip(numbers, anonymous_element_generator())]

    elif alphabetize:
        if not species:
            # Nothing to sort: say so explicitly instead of failing on the
            # tuple unpacking below.
            raise ValueError(error_message)
        species, numbers = zip(*sorted(zip(species, numbers)))

    return "".join(f"{s}{n if n != 1 else ''}" for n, s in zip(numbers, species))


def anonymize_formula(formula: str) -> str:
    """Return the OPTIMADE `chemical_formula_anonymous` representation of a formula.

    The input is a string representation of a chemical formula of the form
    `[A-Z][a-z]*[0-9]*` (potentially with whitespace). The result is a reduced
    chemical formula composed of element symbols drawn from A, B, C... ordered
    from largest proportion to smallest.

    Returns:
        The anonymous chemical formula in the OPTIMADE representation.

    """
    anonymous_formula = _reduce_or_anonymize_formula(
        formula,
        alphabetize=False,
        anonymize=True,
    )
    return anonymous_formula


def reduce_formula(formula: str) -> str:
    """Return the reduced form of a chemical formula.

    The input is a string representation of a chemical formula of the form
    `[A-Z][a-z]*[0-9]*` (potentially with whitespace). It is reduced by the GCD
    of the proportion integers present in the formula, and any leftover "1"
    values are stripped.

    Returns:
        The reduced chemical formula in the OPTIMADE representation.

    """
    reduced_formula = _reduce_or_anonymize_formula(
        formula,
        alphabetize=True,
        anonymize=False,
    )
    return reduced_formula


# Module-level constant: a tuple holding the first 150 labels yielded by
# `anonymous_element_generator`.
ANONYMOUS_ELEMENTS = tuple(itertools.islice(anonymous_element_generator(), 150))
""" Returns the first 150 values of the anonymous element generator. """

Expand Down
Loading