diff --git a/docs/source/developers/contents.rst b/docs/source/developers/contents.rst index f20378314a..ca88025c88 100644 --- a/docs/source/developers/contents.rst +++ b/docs/source/developers/contents.rst @@ -63,6 +63,10 @@ Models may contain the following entries: | |``None`` |if any. (:ref:`See | | | |Below`) | +--------------------+-----------+------------------------------+ +|**md5** |unicode or |The md5 of the contents. | +| |``None`` | | +| | | | ++--------------------+-----------+------------------------------+ .. _modelcontent: @@ -76,6 +80,8 @@ model. There are three model types: **notebook**, **file**, and **directory**. :class:`nbformat.notebooknode.NotebookNode` representing the .ipynb file represented by the model. See the `NBFormat`_ documentation for a full description. + - The ``md5`` field is a hexdigest string of the md5 value of the notebook + file. - ``file`` models - The ``format`` field is either ``"text"`` or ``"base64"``. @@ -85,12 +91,14 @@ model. There are three model types: **notebook**, **file**, and **directory**. file models, ``content`` simply contains the file's bytes after decoding as UTF-8. Non-text (``base64``) files are read as bytes, base64 encoded, and then decoded as UTF-8. + - The ``md5`` field is a hexdigest string of the md5 value of the file. - ``directory`` models - The ``format`` field is always ``"json"``. - The ``mimetype`` field is always ``None``. - The ``content`` field contains a list of :ref:`content-free` models representing the entities in the directory. + - The ``md5`` field is always ``None``. .. note:: @@ -129,6 +137,7 @@ model. There are three model types: **notebook**, **file**, and **directory**. 
"path": "foo/a.ipynb", "type": "notebook", "writable": True, + "md5": "7e47382b370c05a1b14706a2a8aff91a", } # Notebook Model without Content diff --git a/jupyter_server/services/contents/fileio.py b/jupyter_server/services/contents/fileio.py index 3033ebe3fa..ba84a3733b 100644 --- a/jupyter_server/services/contents/fileio.py +++ b/jupyter_server/services/contents/fileio.py @@ -4,6 +4,7 @@ # Copyright (c) Jupyter Development Team. # Distributed under the terms of the Modified BSD License. import errno +import hashlib import os import shutil from base64 import decodebytes, encodebytes @@ -268,7 +269,9 @@ def _read_notebook(self, os_path, as_version=4, capture_validation_error=None): with self.open(os_path, "r", encoding="utf-8") as f: try: return nbformat.read( - f, as_version=as_version, capture_validation_error=capture_validation_error + f, + as_version=as_version, + capture_validation_error=capture_validation_error, ) except Exception as e: e_orig = e @@ -309,6 +312,7 @@ def _read_file(self, os_path, format): format: If 'text', the contents will be decoded as UTF-8. If 'base64', the raw bytes contents will be encoded as base64. + If 'byte', the raw bytes contents will be returned. 
If not specified, try to decode as UTF-8, and fall back to base64 """ if not os.path.isfile(os_path): @@ -316,6 +320,9 @@ def _read_file(self, os_path, format): with self.open(os_path, "rb") as f: bcontent = f.read() + if format == "byte": + # Not for http response but internal use + return bcontent, "byte" if format is None or format == "text": # Try to interpret as unicode if format is unknown or if unicode @@ -350,6 +357,12 @@ def _save_file(self, os_path, content, format): with self.atomic_writing(os_path, text=False) as f: f.write(bcontent) + def _get_md5(self, os_path): + c, _ = self._read_file(os_path, "byte") + md5 = hashlib.md5() # noqa: S324 + md5.update(c) + return md5.hexdigest() + class AsyncFileManagerMixin(FileManagerMixin): """ @@ -417,6 +430,7 @@ async def _read_file(self, os_path, format): format: If 'text', the contents will be decoded as UTF-8. If 'base64', the raw bytes contents will be encoded as base64. + If 'byte', the raw bytes contents will be returned. If not specified, try to decode as UTF-8, and fall back to base64 """ if not os.path.isfile(os_path): @@ -424,6 +438,9 @@ async def _read_file(self, os_path, format): with self.open(os_path, "rb") as f: bcontent = await run_sync(f.read) + if format == "byte": + # Not for http response but internal use + return bcontent, "byte" if format is None or format == "text": # Try to interpret as unicode if format is unknown or if unicode @@ -457,3 +474,9 @@ async def _save_file(self, os_path, content, format): with self.atomic_writing(os_path, text=False) as f: await run_sync(f.write, bcontent) + + async def _get_md5(self, os_path): + c, _ = await self._read_file(os_path, "byte") + md5 = hashlib.md5() # noqa: S324 + await run_sync(md5.update, c) + return md5.hexdigest() diff --git a/jupyter_server/services/contents/filemanager.py b/jupyter_server/services/contents/filemanager.py index 64b5fc122a..fe027a5c49 100644 --- a/jupyter_server/services/contents/filemanager.py +++ 
b/jupyter_server/services/contents/filemanager.py @@ -268,6 +268,7 @@ def _base_model(self, path): model["mimetype"] = None model["size"] = size model["writable"] = self.is_writable(path) + model["md5"] = None return model @@ -335,7 +336,7 @@ def _dir_model(self, path, content=True): return model - def _file_model(self, path, content=True, format=None): + def _file_model(self, path, content=True, format=None, md5=False): """Build a model for a file if content is requested, include the file contents. @@ -364,10 +365,13 @@ def _file_model(self, path, content=True, format=None): content=content, format=format, ) + if md5: + md5 = self._get_md5(os_path) + model.update(md5=md5) return model - def _notebook_model(self, path, content=True): + def _notebook_model(self, path, content=True, md5=False): """Build a notebook model if content is requested, the notebook content will be populated @@ -386,10 +390,12 @@ def _notebook_model(self, path, content=True): model["content"] = nb model["format"] = "json" self.validate_notebook_model(model, validation_error) + if md5: + model["md5"] = self._get_md5(os_path) return model - def get(self, path, content=True, type=None, format=None): + def get(self, path, content=True, type=None, format=None, md5=False): """Takes a path for an entity and returns its model Parameters @@ -404,6 +410,8 @@ def get(self, path, content=True, type=None, format=None): format : str, optional The requested format for file contents. 'text' or 'base64'. Ignored if this returns a notebook or directory model. + md5: bool, optional + Whether to include the md5 of the file contents. 
Returns ------- @@ -431,11 +439,11 @@ def get(self, path, content=True, type=None, format=None): ) model = self._dir_model(path, content=content) elif type == "notebook" or (type is None and path.endswith(".ipynb")): - model = self._notebook_model(path, content=content) + model = self._notebook_model(path, content=content, md5=md5) else: if type == "directory": raise web.HTTPError(400, "%s is not a directory" % path, reason="bad type") - model = self._file_model(path, content=content, format=format) + model = self._file_model(path, content=content, format=format, md5=md5) self.emit(data={"action": "get", "path": path}) return model @@ -686,7 +694,9 @@ def _get_dir_size(self, path="."): ).stdout.split() else: result = subprocess.run( - ["du", "-s", "--block-size=1", path], capture_output=True, check=True + ["du", "-s", "--block-size=1", path], + capture_output=True, + check=True, ).stdout.split() self.log.info(f"current status of du command {result}") @@ -784,7 +794,7 @@ async def _dir_model(self, path, content=True): return model - async def _file_model(self, path, content=True, format=None): + async def _file_model(self, path, content=True, format=None, md5=False): """Build a model for a file if content is requested, include the file contents. 
@@ -813,10 +823,13 @@ async def _file_model(self, path, content=True, format=None): content=content, format=format, ) + if md5: + md5 = await self._get_md5(os_path) + model.update(md5=md5) return model - async def _notebook_model(self, path, content=True): + async def _notebook_model(self, path, content=True, md5=False): """Build a notebook model if content is requested, the notebook content will be populated @@ -835,10 +848,12 @@ async def _notebook_model(self, path, content=True): model["content"] = nb model["format"] = "json" self.validate_notebook_model(model, validation_error) + if md5: + model["md5"] = await self._get_md5(os_path) return model - async def get(self, path, content=True, type=None, format=None): + async def get(self, path, content=True, type=None, format=None, md5=False): """Takes a path for an entity and returns its model Parameters @@ -853,6 +868,8 @@ async def get(self, path, content=True, type=None, format=None): format : str, optional The requested format for file contents. 'text' or 'base64'. Ignored if this returns a notebook or directory model. + md5: bool, optional + Whether to include the md5 of the file contents. 
Returns ------- @@ -875,11 +892,11 @@ async def get(self, path, content=True, type=None, format=None): ) model = await self._dir_model(path, content=content) elif type == "notebook" or (type is None and path.endswith(".ipynb")): - model = await self._notebook_model(path, content=content) + model = await self._notebook_model(path, content=content, md5=md5) else: if type == "directory": raise web.HTTPError(400, "%s is not a directory" % path, reason="bad type") - model = await self._file_model(path, content=content, format=format) + model = await self._file_model(path, content=content, format=format, md5=md5) self.emit(data={"action": "get", "path": path}) return model @@ -1147,7 +1164,9 @@ async def _get_dir_size(self, path: str = ".") -> str: ).stdout.split() else: result = subprocess.run( - ["du", "-s", "--block-size=1", path], capture_output=True, check=True + ["du", "-s", "--block-size=1", path], + capture_output=True, + check=True, ).stdout.split() self.log.info(f"current status of du command {result}") diff --git a/jupyter_server/services/contents/handlers.py b/jupyter_server/services/contents/handlers.py index 4a3dbab19f..cc5ac5b8ca 100644 --- a/jupyter_server/services/contents/handlers.py +++ b/jupyter_server/services/contents/handlers.py @@ -6,6 +6,7 @@ # Distributed under the terms of the Modified BSD License. 
import json from http import HTTPStatus +from typing import Any, Dict, List try: from jupyter_client.jsonutil import json_default @@ -22,12 +23,35 @@ AUTH_RESOURCE = "contents" -def validate_model(model, expect_content): +def _validate_in_or_not(expect_in_model: bool, model: Dict[str, Any], maybe_none_keys: List[str]): + """ + Validate that every key in maybe_none_keys is non-None when expect_in_model is true, and None otherwise. + """ + + if expect_in_model: + errors = [key for key in maybe_none_keys if model[key] is None] + if errors: + raise web.HTTPError( + 500, + f"Keys unexpectedly None: {errors}", + ) + else: + errors = {key: model[key] for key in maybe_none_keys if model[key] is not None} # type: ignore[assignment] + if errors: + raise web.HTTPError( + 500, + f"Keys unexpectedly not None: {errors}", + ) + + +def validate_model(model, expect_content, expect_md5=False): """ Validate a model returned by a ContentsManager method. If expect_content is True, then we expect non-null entries for 'content' and 'format'. + + If expect_md5 is True, then we expect non-null entries for 'md5'. 
""" required_keys = { "name", @@ -39,6 +63,7 @@ def validate_model(model, expect_content): "mimetype", "content", "format", + "md5", } missing = required_keys - set(model.keys()) if missing: @@ -47,21 +72,10 @@ def validate_model(model, expect_content): f"Missing Model Keys: {missing}", ) - maybe_none_keys = ["content", "format"] - if expect_content: - errors = [key for key in maybe_none_keys if model[key] is None] - if errors: - raise web.HTTPError( - 500, - f"Keys unexpectedly None: {errors}", - ) - else: - errors = {key: model[key] for key in maybe_none_keys if model[key] is not None} # type: ignore[assignment] - if errors: - raise web.HTTPError( - 500, - f"Keys unexpectedly not None: {errors}", - ) + content_keys = ["content", "format"] + md5_keys = ["md5"] + _validate_in_or_not(expect_content, model, content_keys) + _validate_in_or_not(expect_md5, model, md5_keys) class ContentsAPIHandler(APIHandler): @@ -122,6 +136,11 @@ async def get(self, path=""): raise web.HTTPError(400, "Content %r is invalid" % content_str) content = int(content_str or "") + md5_str = self.get_query_argument("md5", default="0") + if md5_str not in {"0", "1"}: + raise web.HTTPError(400, "md5 %r is invalid" % md5_str) + md5 = int(md5_str) + if not cm.allow_hidden and await ensure_async(cm.is_hidden(path)): await self._finish_error( HTTPStatus.NOT_FOUND, f"file or directory {path!r} does not exist" @@ -133,9 +152,10 @@ async def get(self, path=""): type=type, format=format, content=content, + md5=md5, ) ) - validate_model(model, expect_content=content) + validate_model(model, expect_content=content, expect_md5=md5) self._finish_model(model, location=False) except web.HTTPError as exc: # 404 is okay in this context, catch exception and return 404 code to prevent stack trace on client @@ -165,7 +185,7 @@ async def patch(self, path=""): raise web.HTTPError(400, f"Cannot rename file or directory {path!r}") model = await ensure_async(cm.update(model, path)) - validate_model(model, 
expect_content=False) + validate_model(model, expect_content=False, expect_md5=False) self._finish_model(model) async def _copy(self, copy_from, copy_to=None): @@ -178,7 +198,7 @@ async def _copy(self, copy_from, copy_to=None): ) model = await ensure_async(self.contents_manager.copy(copy_from, copy_to)) self.set_status(201) - validate_model(model, expect_content=False) + validate_model(model, expect_content=False, expect_md5=False) self._finish_model(model) async def _upload(self, model, path): @@ -186,7 +206,7 @@ async def _upload(self, model, path): self.log.info("Uploading file to %s", path) model = await ensure_async(self.contents_manager.new(model, path)) self.set_status(201) - validate_model(model, expect_content=False) + validate_model(model, expect_content=False, expect_md5=False) self._finish_model(model) async def _new_untitled(self, path, type="", ext=""): @@ -196,7 +216,7 @@ async def _new_untitled(self, path, type="", ext=""): self.contents_manager.new_untitled(path=path, type=type, ext=ext) ) self.set_status(201) - validate_model(model, expect_content=False) + validate_model(model, expect_content=False, expect_md5=False) self._finish_model(model) async def _save(self, model, path): @@ -205,7 +225,7 @@ async def _save(self, model, path): if not chunk or chunk == -1: # Avoid tedious log information self.log.info("Saving file at %s", path) model = await ensure_async(self.contents_manager.save(model, path)) - validate_model(model, expect_content=False) + validate_model(model, expect_content=False, expect_md5=False) self._finish_model(model) @web.authenticated @@ -356,7 +376,13 @@ async def delete(self, path, checkpoint_id): class NotebooksRedirectHandler(JupyterHandler): """Redirect /api/notebooks to /api/contents""" - SUPPORTED_METHODS = ("GET", "PUT", "PATCH", "POST", "DELETE") # type:ignore[assignment] + SUPPORTED_METHODS = ( + "GET", + "PUT", + "PATCH", + "POST", + "DELETE", + ) # type:ignore[assignment] def get(self, path): """Handle a notebooks 
redirect.""" diff --git a/jupyter_server/services/contents/manager.py b/jupyter_server/services/contents/manager.py index f4f70fc338..94684bb022 100644 --- a/jupyter_server/services/contents/manager.py +++ b/jupyter_server/services/contents/manager.py @@ -447,7 +447,7 @@ def exists(self, path): """ return self.file_exists(path) or self.dir_exists(path) - def get(self, path, content=True, type=None, format=None): + def get(self, path, content=True, type=None, format=None, md5=False): """Get a file or directory model.""" raise NotImplementedError diff --git a/tests/services/contents/test_api.py b/tests/services/contents/test_api.py index 746c663345..8733b02ae7 100644 --- a/tests/services/contents/test_api.py +++ b/tests/services/contents/test_api.py @@ -102,6 +102,20 @@ async def test_get_nb_contents(jp_fetch, contents, path, name): assert isinstance(model["content"]["metadata"], dict) +@pytest.mark.parametrize("path,name", dirs) +async def test_get_nb_md5(jp_fetch, contents, path, name): + nbname = name + ".ipynb" + nbpath = (path + "/" + nbname).lstrip("/") + r = await jp_fetch("api", "contents", nbpath, method="GET", params=dict(md5="1")) + model = json.loads(r.body.decode()) + assert model["name"] == nbname + assert model["path"] == nbpath + assert model["type"] == "notebook" + assert len(model["md5"]) == 32 + assert "metadata" in model["content"] + assert isinstance(model["content"]["metadata"], dict) + + @pytest.mark.parametrize("path,name", dirs) async def test_get_nb_no_contents(jp_fetch, contents, path, name): nbname = name + ".ipynb" @@ -186,6 +200,19 @@ async def test_get_text_file_contents(jp_fetch, contents, path, name): assert expected_http_error(e, 400) +@pytest.mark.parametrize("path,name", dirs) +async def test_get_text_file_md5(jp_fetch, contents, path, name): + txtname = name + ".txt" + txtpath = (path + "/" + txtname).lstrip("/") + r = await jp_fetch("api", "contents", txtpath, method="GET", params=dict(md5="1")) + model = json.loads(r.body.decode()) + 
assert model["name"] == txtname + assert model["path"] == txtpath + assert len(model["md5"]) == 32 + assert model["format"] == "text" + assert model["type"] == "file" + + async def test_get_404_hidden(jp_fetch, contents, contents_dir): # Create text files hidden_dir = contents_dir / ".hidden" diff --git a/tests/services/contents/test_fileio.py b/tests/services/contents/test_fileio.py index 0f0cf1bfed..a72acfa429 100644 --- a/tests/services/contents/test_fileio.py +++ b/tests/services/contents/test_fileio.py @@ -142,6 +142,8 @@ def test_file_manager_mixin(tmpdir): mixin.log = logging.getLogger() bad_content = tmpdir / "bad_content.ipynb" bad_content.write_text("{}", "utf8") + # Same as `echo -n {} | md5sum` + assert mixin._get_md5(bad_content) == "99914b932bd37a50b983c5e7c90ae93b" with pytest.raises(HTTPError): mixin._read_notebook(bad_content) other = path_to_intermediate(bad_content) @@ -164,6 +166,8 @@ async def test_async_file_manager_mixin(tmpdir): mixin.log = logging.getLogger() bad_content = tmpdir / "bad_content.ipynb" bad_content.write_text("{}", "utf8") + # Same as `echo -n {} | md5sum` + assert await mixin._get_md5(bad_content) == "99914b932bd37a50b983c5e7c90ae93b" with pytest.raises(HTTPError): await mixin._read_notebook(bad_content) other = path_to_intermediate(bad_content) diff --git a/tests/services/contents/test_manager.py b/tests/services/contents/test_manager.py index 8d4052dd2d..6e3ac01945 100644 --- a/tests/services/contents/test_manager.py +++ b/tests/services/contents/test_manager.py @@ -571,6 +571,9 @@ async def test_get(jp_contents_manager): # noqa nb_as_bin_file = await ensure_async(cm.get(path, content=True, type="file", format="base64")) assert nb_as_bin_file["format"] == "base64" + nb_with_md5 = await ensure_async(cm.get(path, md5=True)) + assert nb_with_md5["md5"] + # Test in sub-directory sub_dir = "/foo/" _make_dir(cm, "foo") @@ -585,7 +588,7 @@ async def test_get(jp_contents_manager): # noqa # Test with a regular file. 
file_model_path = (await ensure_async(cm.new_untitled(path=sub_dir, ext=".txt")))["path"] - file_model = await ensure_async(cm.get(file_model_path)) + file_model = await ensure_async(cm.get(file_model_path, md5=True)) expected_model = { "content": "", "format": "text", @@ -600,6 +603,7 @@ async def test_get(jp_contents_manager): # noqa assert file_model[key] == value assert "created" in file_model assert "last_modified" in file_model + assert file_model["md5"] == "d41d8cd98f00b204e9800998ecf8427e" # Create a sub-sub directory to test getting directory contents with a # subdir.