Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

142 model download change #150

Merged
merged 20 commits into from
Jul 19, 2024
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 15 additions & 9 deletions aiida_mlip/calculations/base.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
"""Base class for features common to most calculations."""

import shutil

from ase.io import read, write

from aiida.common import InputValidationError, datastructures
Expand Down Expand Up @@ -199,8 +201,6 @@ def prepare_for_submission(
An instance of `aiida.common.datastructures.CalcInfo`.
"""

# Create needed inputs

if "struct" in self.inputs:
structure = self.inputs.struct
elif "config" in self.inputs and "struct" in self.inputs.config.as_dictionary:
Expand All @@ -211,8 +211,8 @@ def prepare_for_submission(
# Transform the structure data in xyz file called input_filename
input_filename = self.inputs.metadata.options.input_filename
atoms = structure.get_ase()
# with folder.open(input_filename, mode="w", encoding='utf8') as file:
write(folder.abspath + "/" + input_filename, images=atoms)
with folder.open(input_filename, mode="w", encoding="utf8") as file:
write(file.name, images=atoms)

log_filename = (self.inputs.log_filename).value
cmd_line = {
Expand All @@ -231,7 +231,7 @@ def prepare_for_submission(
# Define architecture from model if model is given,
# otherwise get architecture from inputs and download default model
self._add_arch_to_cmdline(cmd_line)
self._add_model_to_cmdline(cmd_line)
self._add_model_to_cmdline(cmd_line, folder)

if "config" in self.inputs:
# Add config file to command line
Expand Down Expand Up @@ -290,8 +290,7 @@ def _add_arch_to_cmdline(self, cmd_line: dict) -> dict:
cmd_line["arch"] = architecture

def _add_model_to_cmdline(
self,
cmd_line: dict,
self, cmd_line: dict, folder: aiida.common.folders.Folder
) -> dict:
"""
Find model in inputs or config file and add to command line if needed.
Expand All @@ -301,6 +300,9 @@ def _add_model_to_cmdline(
cmd_line : dict
Dictionary containing the cmd line keys.

folder : aiida.common.folders.Folder
federicazanca marked this conversation as resolved.
Show resolved Hide resolved
An `aiida.common.folders.Folder` to temporarily write files on disk.
federicazanca marked this conversation as resolved.
Show resolved Hide resolved

Returns
-------
dict
Expand All @@ -311,6 +313,10 @@ def _add_model_to_cmdline(
# Raise error if model is None (different than model not given as input)
if self.inputs.model is None:
raise ValueError("Model cannot be None")
model_path = self.inputs.model.filepath
if model_path:

with self.inputs.model.open(mode="rb") as source:
with folder.open("modelcopy.model", mode="wb") as target:
federicazanca marked this conversation as resolved.
Show resolved Hide resolved
federicazanca marked this conversation as resolved.
Show resolved Hide resolved
shutil.copyfileobj(source, target)

model_path = "modelcopy.model"
federicazanca marked this conversation as resolved.
Show resolved Hide resolved
cmd_line.setdefault("calc-kwargs", {})["model"] = model_path
8 changes: 5 additions & 3 deletions aiida_mlip/calculations/train.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Class for training machine learning models."""

from pathlib import Path
import shutil

from aiida.common import InputValidationError, datastructures
import aiida.common.folders
Expand Down Expand Up @@ -175,9 +176,10 @@ def prepare_for_submission(

# Add foundation_model to the config file if fine-tuning is enabled
if self.inputs.fine_tune and "foundation_model" in self.inputs:
model_data = self.inputs.foundation_model
foundation_model_path = model_data.filepath
config_parse += f"\nfoundation_model: {foundation_model_path}"
with self.inputs.foundation_model.open(mode="rb") as source:
with folder.open("modelcopy.model", mode="wb") as target:
federicazanca marked this conversation as resolved.
Show resolved Hide resolved
shutil.copyfileobj(source, target)
config_parse += "foundation_model: modelcopy.model"
federicazanca marked this conversation as resolved.
Show resolved Hide resolved

# Copy config file content inside the folder where the calculation is run
config_copy = "mlip_train.yml"
Expand Down
128 changes: 48 additions & 80 deletions aiida_mlip/data/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@
from pathlib import Path
from typing import Any, Optional, Union
from urllib import request
from urllib.parse import urlparse

from aiida.orm import SinglefileData
from aiida.orm import QueryBuilder, SinglefileData, load_node
from aiida.tools import delete_nodes


class ModelData(SinglefileData):
Expand All @@ -26,16 +26,16 @@ class ModelData(SinglefileData):
----------
architecture : str
Architecture of the mlip model.
filepath : str
Path of the mlip model.
model_hash : str
Hash of the model.

Methods
-------
set_file(file, filename=None, architecture=None, **kwargs)
Set the file for the node.
local_file(file, architecture, filename=None):
from_local(file, architecture, filename=None):
Create a ModelData instance from a local file.
download(url, architecture, filename=None, cache_dir=None, force_download=False)
from_url(url, architecture, filename=None, cache_dir=None, keep_file=False)
Download a file from a URL and save it as ModelData.

Other Parameters
Expand Down Expand Up @@ -69,47 +69,6 @@ def _calculate_hash(file: Union[str, Path]) -> str:
file_hash = sha256.hexdigest()
return file_hash

@classmethod
def _check_existing_file(cls, file: Union[str, Path]) -> Path:
"""
Check if a file already exists and return the path of the existing file.

Parameters
----------
file : Union[str, Path]
Path to the downloaded model file.

Returns
-------
Path
The path of the model file of interest (same as input path if no duplicates
were found).
"""
file_hash = cls._calculate_hash(file)

def is_diff_file(curr_path: Path) -> bool:
"""
Filter to check if two files are different.

Parameters
----------
curr_path : Path
Path to the file to compare with.

Returns
-------
bool
True if the files are different, False otherwise.
"""
return curr_path.is_file() and not curr_path.samefile(file)

file_folder = Path(file).parent
for existing_file in filter(is_diff_file, file_folder.rglob("*")):
if cls._calculate_hash(existing_file) == file_hash:
file.unlink()
return existing_file
return Path(file)

def __init__(
self,
file: Union[str, Path],
Expand All @@ -136,7 +95,6 @@ def __init__(
"""
super().__init__(file, filename, **kwargs)
self.base.attributes.set("architecture", architecture)
self.base.attributes.set("filepath", str(file))

def set_file(
self,
Expand Down Expand Up @@ -164,10 +122,12 @@ def set_file(
"""
super().set_file(file, filename, **kwargs)
self.base.attributes.set("architecture", architecture)
self.base.attributes.set("filepath", str(file))
# here compute hash and set attribute
model_hash = self._calculate_hash(file)
self.base.attributes.set("model_hash", model_hash)

@classmethod
def local_file(
def from_local(
cls,
file: Union[str, Path],
architecture: str,
Expand Down Expand Up @@ -195,13 +155,13 @@ def local_file(

@classmethod
# pylint: disable=too-many-arguments
def download(
def from_url(
federicazanca marked this conversation as resolved.
Show resolved Hide resolved
cls,
url: str,
architecture: str,
filename: Optional[str] = None,
filename: Optional[str] = "tmp_file.model",
cache_dir: Optional[Union[str, Path]] = None,
force_download: Optional[bool] = False,
keep_file: Optional[bool] = False,
):
"""
Download a file from a URL and save it as ModelData.
Expand All @@ -213,13 +173,13 @@ def download(
architecture : [str]
Architecture of the mlip model.
filename : Optional[str], optional
Name to be used for the file (defaults to the name of provided file).
Name to be used for the file (defaults to "tmp_file.model").
cache_dir : Optional[Union[str, Path]], optional
Path to the folder where the file has to be saved
(defaults to "~/.cache/mlips/").
force_download : Optional[bool], optional
True to keep the downloaded model even if there are duplicates
(default: False).
keep_file : Optional[bool], optional
True to keep the downloaded model, even if there are duplicates.
(default: False, the file is deleted and only saved in the database).

Returns
-------
Expand All @@ -231,32 +191,40 @@ def download(
)
arch_dir = (cache_dir / architecture) if architecture else cache_dir

# cache_path = cache_dir.resolve()
arch_path = arch_dir.resolve()
arch_path.mkdir(parents=True, exist_ok=True)

model_name = urlparse(url).path.split("/")[-1]

file = arch_path / filename if filename else arch_path / model_name

# If file already exists, use next indexed name
stem = file.stem
i = 1
while file.exists():
i += 1
file = file.with_stem(f"{stem}_{i}")
file = arch_path / filename

# Download file
request.urlretrieve(url, file)

if force_download:
print(f"filename changed to {file}")
return cls.local_file(file=file, architecture=architecture)

# Check if the hash of the just downloaded file matches any other file
filepath = cls._check_existing_file(file)

return cls.local_file(file=filepath, architecture=architecture)
model = cls.from_local(file=file, architecture=architecture)

if keep_file:
return model

file.unlink(missing_ok=True)

qb = QueryBuilder()
federicazanca marked this conversation as resolved.
Show resolved Hide resolved
qb.append(ModelData, project=["attributes", "pk", "ctime"])

# Looking for ModelData in the whole database
for i in qb.iterdict():
# If the hash is the same as the new model, but not the creation time
# it means that there is already a model that is the same, use that
if i["ModelData_1"]["attributes"]["model_hash"] == model.model_hash:
if i["ModelData_1"]["ctime"] != model.ctime:
delete_nodes(
[model.uuid],
dry_run=False,
create_forward=True,
call_calc_forward=True,
call_work_forward=True,
)
model = load_node(i["ModelData_1"]["pk"])
break
return model

@property
def architecture(self) -> str:
Expand All @@ -271,13 +239,13 @@ def architecture(self) -> str:
return self.base.attributes.get("architecture")

@property
def filepath(self) -> str:
def model_hash(self) -> str:
"""
Return the filepath.
Return hash of the model.

Returns
-------
str
Path of the mlip model.
Hash of the MLIP model.
"""
return self.base.attributes.get("filepath")
return self.base.attributes.get("model_hash")
6 changes: 3 additions & 3 deletions aiida_mlip/helpers/help_load.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,15 +42,15 @@ def load_model(
The loaded model.
"""
if model is None:
loaded_model = ModelData.download(
loaded_model = ModelData.from_url(
"https://github.com/stfc/janus-core/raw/main/tests/models/mace_mp_small.model", # pylint: disable=line-too-long
architecture,
cache_dir=cache_dir,
)
elif (file_path := Path(model)).is_file():
loaded_model = ModelData.local_file(file_path, architecture=architecture)
loaded_model = ModelData.from_local(file_path, architecture=architecture)
else:
loaded_model = ModelData.download(
loaded_model = ModelData.from_url(
model, architecture=architecture, cache_dir=cache_dir
)
return loaded_model
Expand Down
4 changes: 2 additions & 2 deletions aiida_mlip/parsers/train_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,8 +164,8 @@ def _save_models(self, model_output: Path, compiled_model_output: Path) -> None:
Path to the compiled model output file.
"""
architecture = "mace_mp"
model = ModelData.local_file(model_output, architecture=architecture)
compiled_model = ModelData.local_file(
model = ModelData.from_local(model_output, architecture=architecture)
compiled_model = ModelData.from_local(
compiled_model_output, architecture=architecture
)

Expand Down
19 changes: 3 additions & 16 deletions docs/source/user_guide/data.rst
Original file line number Diff line number Diff line change
Expand Up @@ -23,34 +23,21 @@ Usage

.. code-block:: python

model = ModelData.local_file('/path/to/file', filename='model', architecture='mace')
model = ModelData.from_local('/path/to/file', filename='model', architecture='mace')

- To download a file and save it as a `ModelData` object:

.. code-block:: python

model = ModelData.download('http://yoururl.test/model', architecture='mace', filename='model', cache_dir='/home/mlip/', force_download=False)
model = ModelData.from_url('http://yoururl.test/model', architecture='mace', filename='model', cache_dir='/home/mlip/', keep_file=False)

- The architecture of the model file can be accessed using the `architecture` property:

.. code-block:: python

model_arch = model.architecture



- The filepath of the model file can be accessed using the `filepath` property:
federicazanca marked this conversation as resolved.
Show resolved Hide resolved

.. code-block:: python

file_path = model.filepath

.. warning::

When using shared data, the ``filepath`` could point to a inaccessible location on another computer.
So if you are using data from someone else, for both the model data and the config file, consider using the ``get_content()`` method to create a new file with identical content.
Then, use the filepath of the newly created file for running calculation.
A more robust solution to this problem is going to be implemented.
As with any `SinglefileData`, the content of the model file can be accessed using the `get_content()` method.


JanusConfigfile
Expand Down
4 changes: 2 additions & 2 deletions docs/source/user_guide/tutorial.rst
Original file line number Diff line number Diff line change
Expand Up @@ -35,13 +35,13 @@ In this example we use MACE with a model that we download from this URL: "https:

from aiida_mlip.data.model import ModelData
url = "https://github.com/stfc/janus-core/raw/main/tests/models/mace_mp_small.model"
model = ModelData.download(url, architecture="mace", cache_dir="/.cache/")
model = ModelData.from_url(url, architecture="mace", cache_dir="/.cache/")

If we already have the model saved in some folder we can save it as:

.. code-block:: python

model = ModelData.local_file("/path/to/model", architecture="mace")
model = ModelData.from_local("/path/to/model", architecture="mace")

Another parameter that we need to define as an AiiDA type is the code. Assuming the code is saved as `janus` on the `localhost` computer, the code information that is needed can be loaded as follows:

Expand Down
Loading