From 7f8870f59ab163d7b06a3a0f8ced816e25526c69 Mon Sep 17 00:00:00 2001
From: Nell Hardcastle
Date: Tue, 27 Sep 2022 14:02:17 -0700
Subject: [PATCH 01/34] refactor(worker): Only retrieve files in per tree
 object chunks

This breaking API change moves the worker to per-tree file requests
instead of prefix slicing the entire file tree. This allows more
efficient top-level requests and recursion by fetching smaller slices
of the dataset as needed.
---
 services/datalad/datalad_service/app.py       |  3 +
 .../datalad/datalad_service/common/annex.py   | 19 +++--
 .../datalad/datalad_service/handlers/draft.py |  6 +-
 .../datalad/datalad_service/handlers/files.py | 79 ++++++++-----------
 .../datalad_service/handlers/snapshots.py     |  5 +-
 .../datalad/datalad_service/handlers/tree.py  | 21 +++++
 .../datalad/datalad_service/tasks/files.py    |  4 +-
 .../datalad_service/tasks/snapshots.py        |  3 +-
 services/datalad/tests/test_files.py          | 50 ++++++++----
 9 files changed, 114 insertions(+), 76 deletions(-)
 create mode 100644 services/datalad/datalad_service/handlers/tree.py

diff --git a/services/datalad/datalad_service/app.py b/services/datalad/datalad_service/app.py
index 46343c8397..f3769705a3 100644
--- a/services/datalad/datalad_service/app.py
+++ b/services/datalad/datalad_service/app.py
@@ -17,6 +17,7 @@
 from datalad_service.handlers.snapshots import SnapshotResource
 from datalad_service.handlers.heartbeat import HeartbeatResource
 from datalad_service.handlers.publish import PublishResource
+from datalad_service.handlers.tree import TreeResource
 from datalad_service.handlers.upload import UploadResource
 from datalad_service.handlers.upload_file import UploadFileResource
 from datalad_service.handlers.validation import ValidationResource
@@ -66,6 +67,7 @@ def create_app(annex_path):
     dataset_files = FilesResource(store)
     dataset_annex_objects = AnnexObjectsResource(store)
     dataset_publish = PublishResource(store)
+    dataset_tree = TreeResource(store)
     dataset_snapshots = SnapshotResource(store)
     dataset_upload = UploadResource(store)
     dataset_upload_file = UploadFileResource(store)
@@ -90,6 +92,7 @@ def create_app(annex_path):
     api.add_route('/datasets/{dataset}/files', dataset_files)
     api.add_route('/datasets/{dataset}/files/{filename:path}', dataset_files)
+    api.add_route('/datasets/{dataset}/tree/{tree}', dataset_tree)
 
     api.add_route('/datasets/{dataset}/snapshots', dataset_snapshots)
     api.add_route(
diff --git a/services/datalad/datalad_service/common/annex.py b/services/datalad/datalad_service/common/annex.py
index 8bd377a6e1..1a8fc84b2e 100644
--- a/services/datalad/datalad_service/common/annex.py
+++ b/services/datalad/datalad_service/common/annex.py
@@ -50,9 +50,9 @@ def read_ls_tree_line(gitTreeLine, files, symlinkFilenames, symlinkObjects):
     filename, mode, obj_type, obj_hash, size = parse_ls_tree_line(
         gitTreeLine)
     # Skip git / datalad files
-    if filename.startswith('.git/'):
+    if filename.startswith('.git'):
         return
-    if filename.startswith('.datalad/'):
+    if filename.startswith('.datalad'):
         return
     if filename == '.gitattributes':
         return
@@ -66,9 +66,14 @@ def read_ls_tree_line(gitTreeLine, files, symlinkFilenames, symlinkObjects):
         return
     else:
         # Immediately append regular files
-        file_id = compute_file_hash(obj_hash, filename)
-        files.append({'filename': filename, 'size': int(size),
-                      'id': file_id, 'key': obj_hash, 'urls': [], 'annexed': False})
+        if (size == '-'):
+            # Tree objects do not have sizes and are never annexed
+            files.append(
+                {'id': obj_hash, 'filename': filename, 'directory': True, 'annexed': False, 'size': 0, 'urls': []})
+        else:
+            file_id = compute_file_hash(obj_hash, filename)
+            files.append({'filename': filename, 'size': int(size),
+                          'id': file_id, 'key': obj_hash, 'urls': [], 'annexed': False})
 
 
 def compute_rmet(key):
@@ -176,10 +181,10 @@ def get_repo_urls(path, files):
     return files
 
 
-def get_repo_files(dataset_path, branch='HEAD'):
+def get_repo_files(dataset_path, tree):
     """Read all files in a repo at a given branch, tag, or commit hash."""
     gitProcess = subprocess.Popen(
-        ['git', 'ls-tree', '-l', '-r', branch], cwd=dataset_path, stdout=subprocess.PIPE, encoding='utf-8')
+        ['git', 'ls-tree', '-l', tree], cwd=dataset_path, stdout=subprocess.PIPE, encoding='utf-8')
     files = []
     symlinkFilenames = []
     symlinkObjects = []
diff --git a/services/datalad/datalad_service/handlers/draft.py b/services/datalad/datalad_service/handlers/draft.py
index 8cfe59b5a3..07d2efe8fe 100644
--- a/services/datalad/datalad_service/handlers/draft.py
+++ b/services/datalad/datalad_service/handlers/draft.py
@@ -1,5 +1,3 @@
-import sys
-
 import falcon
 import pygit2
 
@@ -19,7 +17,9 @@ def on_get(self, req, resp, dataset):
             # Maybe turn this into status?
             dataset_path = self.store.get_dataset_path(dataset)
             repo = pygit2.Repository(dataset_path)
-            resp.media = {'hexsha': repo.head.target.hex}
+            commit = repo.revparse_single('HEAD')
+            resp.media = {'hexsha': commit.hex,
+                          'tree': commit.tree_id.hex}
             resp.status = falcon.HTTP_OK
         else:
             resp.status = falcon.HTTP_NOT_FOUND
diff --git a/services/datalad/datalad_service/handlers/files.py b/services/datalad/datalad_service/handlers/files.py
index 604c340ac1..491f4ce1ca 100644
--- a/services/datalad/datalad_service/handlers/files.py
+++ b/services/datalad/datalad_service/handlers/files.py
@@ -6,7 +6,6 @@
 from datalad_service.common.git import git_show
 from datalad_service.common.user import get_user_info
 from datalad_service.common.stream import update_file
-from datalad_service.tasks.files import get_files
 from datalad_service.tasks.files import remove_files
 
 
@@ -16,52 +15,42 @@ def __init__(self, store):
         self.store = store
         self.logger = logging.getLogger('datalad_service.' + __name__)
 
-    def on_get(self, req, resp, dataset, filename=None, snapshot='HEAD'):
+    def on_get(self, req, resp, dataset, filename, snapshot='HEAD'):
         ds_path = self.store.get_dataset_path(dataset)
-        if filename:
-            try:
-                file_content = git_show(ds_path, snapshot, filename)
-                # If the file begins with an annex path, return that path
-                if file_content[0:4096].find('.git/annex') != -1:
-                    # Resolve absolute path for annex target
-                    target_path = os.path.join(
-                        ds_path, os.path.dirname(filename), file_content)
-                    # Verify the annex path is within the dataset dir
-                    if ds_path == os.path.commonpath((ds_path, target_path)):
-                        fd = open(target_path, 'rb')
-                        resp.stream = fd
-                        resp.stream_len = os.fstat(fd.fileno()).st_size
-                        resp.status = falcon.HTTP_OK
-                    else:
-                        resp.media = {'error': 'file not found in git tree'}
-                        resp.status = falcon.HTTP_NOT_FOUND
-                else:
-                    resp.body = file_content
-                    resp.status = falcon.HTTP_OK
-            except KeyError:
-                # File is not present in tree
-                resp.media = {'error': 'file not found in git tree'}
-                resp.status = falcon.HTTP_NOT_FOUND
-            except IOError:
-                # File is not kept locally
-                resp.media = {'error': 'file not found'}
-                resp.status = falcon.HTTP_NOT_FOUND
-            except:
-                # Some unknown error
-                resp.media = {
-                    'error': 'an unknown error occurred accessing this file'}
-                resp.status = falcon.HTTP_INTERNAL_SERVER_ERROR
-                self.logger.exception(
-                    'An unknown error processing file "{}"'.format(filename))
-        else:
-            # Request for index of files
-            # Return a list of file objects
-            # {name, path, size}
-            try:
-                files = get_files(self.store, dataset, snapshot)
-                resp.media = {'files': files}
-            except:
-                resp.status = falcon.HTTP_INTERNAL_SERVER_ERROR
+        try:
+            file_content = git_show(ds_path, snapshot, filename)
+            # If the file begins with an annex path, return that path
+            if file_content[0:4096].find('.git/annex') != -1:
+                # Resolve absolute path for annex target
+                target_path = os.path.join(
+                    ds_path, os.path.dirname(filename), file_content)
+                # Verify the annex path is within the dataset dir
+                if ds_path == os.path.commonpath((ds_path, target_path)):
+                    fd = open(target_path, 'rb')
+                    resp.stream = fd
+                    resp.stream_len = os.fstat(fd.fileno()).st_size
+                    resp.status = falcon.HTTP_OK
+                else:
+                    resp.media = {'error': 'file not found in git tree'}
+                    resp.status = falcon.HTTP_NOT_FOUND
+            else:
+                resp.body = file_content
+                resp.status = falcon.HTTP_OK
+        except KeyError:
+            # File is not present in tree
+            resp.media = {'error': 'file not found in git tree'}
+            resp.status = falcon.HTTP_NOT_FOUND
+        except IOError:
+            # File is not kept locally
+            resp.media = {'error': 'file not found'}
+            resp.status = falcon.HTTP_NOT_FOUND
+        except:
+            # Some unknown error
+            resp.media = {
+                'error': 'an unknown error occurred accessing this file'}
+            resp.status = falcon.HTTP_INTERNAL_SERVER_ERROR
+            self.logger.exception(
+                'An unknown error processing file "{}"'.format(filename))
 
     def on_post(self, req, resp, dataset, filename):
         """Post will create new files and adds them to the annex if they do not exist, else update existing files."""
diff --git a/services/datalad/datalad_service/handlers/snapshots.py b/services/datalad/datalad_service/handlers/snapshots.py
index 0d999db5f5..4dd16ceef9 100644
--- a/services/datalad/datalad_service/handlers/snapshots.py
+++ b/services/datalad/datalad_service/handlers/snapshots.py
@@ -4,7 +4,7 @@
 import falcon
 
 from datalad_service.tasks.snapshots import SnapshotDescriptionException, create_snapshot, get_snapshot, get_snapshots, SnapshotExistsException
-from datalad_service.tasks.files import get_files
+from datalad_service.tasks.files import get_tree
 from datalad_service.tasks.publish import export_dataset, monitor_remote_configs
 from datalad_service.common.git import delete_tag
 
@@ -20,8 +20,7 @@ def __init__(self, store):
     def on_get(self, req, resp, dataset, snapshot=None):
         """Get the tree of files for a snapshot."""
         if snapshot:
-            files = get_files(self.store, dataset,
-                              branch=snapshot)
+            files = get_tree(self.store, dataset, snapshot)
             response = get_snapshot(self.store, dataset, snapshot)
             response['files'] = files
             resp.media = response
diff --git a/services/datalad/datalad_service/handlers/tree.py b/services/datalad/datalad_service/handlers/tree.py
new file mode 100644
index 0000000000..6ebde33a05
--- /dev/null
+++ b/services/datalad/datalad_service/handlers/tree.py
@@ -0,0 +1,21 @@
+import logging
+
+import falcon
+
+from datalad_service.tasks.files import get_tree
+
+
+class TreeResource(object):
+    def __init__(self, store):
+        self.store = store
+        self.logger = logging.getLogger('datalad_service.' + __name__)
+
+    def on_get(self, req, resp, dataset, tree):
+        # Request for index of files
+        # Return a list of file objects
+        # {name, path, size}
+        try:
+            files = get_tree(self.store, dataset, tree)
+            resp.media = {'files': files}
+        except:
+            resp.status = falcon.HTTP_INTERNAL_SERVER_ERROR
diff --git a/services/datalad/datalad_service/tasks/files.py b/services/datalad/datalad_service/tasks/files.py
index 49195bc4e0..bc4e803137 100644
--- a/services/datalad/datalad_service/tasks/files.py
+++ b/services/datalad/datalad_service/tasks/files.py
@@ -26,10 +26,10 @@ def commit_files(store, dataset, files, name=None, email=None, cookies=None):
     return ref
 
 
-def get_files(store, dataset, branch=None):
+def get_tree(store, dataset, tree):
     """Get the working tree, optionally a branch tree."""
     dataset_path = store.get_dataset_path(dataset)
-    return get_repo_files(dataset_path, branch)
+    return get_repo_files(dataset_path, tree)
 
 
 def remove_files(store, dataset, paths, name=None, email=None, cookies=None):
diff --git a/services/datalad/datalad_service/tasks/snapshots.py b/services/datalad/datalad_service/tasks/snapshots.py
index 5540237dbe..8c1aeea5ee 100644
--- a/services/datalad/datalad_service/tasks/snapshots.py
+++ b/services/datalad/datalad_service/tasks/snapshots.py
@@ -26,7 +26,8 @@ def get_snapshot(store, dataset, snapshot):
     commit, _ = repo.resolve_refish(snapshot)
     hexsha = commit.hex
     created = commit.commit_time
-    return {'id': '{}:{}'.format(dataset, snapshot), 'tag': snapshot, 'hexsha': hexsha, 'created': created}
+    tree = commit.tree_id.hex
+    return {'id': '{}:{}'.format(dataset, snapshot), 'tag': snapshot, 'hexsha': hexsha, 'created': created, 'tree': tree}
 
 
 def get_snapshots(store, dataset):
diff --git a/services/datalad/tests/test_files.py b/services/datalad/tests/test_files.py
index 82b8f8b21a..db4f8a7fd8 100644
--- a/services/datalad/tests/test_files.py
+++ b/services/datalad/tests/test_files.py
@@ -121,13 +121,11 @@ def test_file_indexing(client, new_dataset):
     response = client.simulate_post('/datasets/{}/draft'.format(ds_id))
     assert response.status == falcon.HTTP_OK
     # Get the files in the committed tree
-    response = client.simulate_get('/datasets/{}/files'.format(ds_id))
-    assert response.status == falcon.HTTP_OK
-    response_content = json.loads(response.content)
-    print('response content:', response_content['files'])
-    print('not annexed files:', new_dataset.repo.is_under_annex(
-        ['dataset_description.json']))
-    assert all(f in response_content['files'] for f in [
+    root_response = client.simulate_get(
+        '/datasets/{}/tree/{}'.format(ds_id, 'HEAD'))
+    assert root_response.status == falcon.HTTP_OK
+    root_content = json.loads(root_response.content)
+    for f in [
         {'filename': 'dataset_description.json', 'size': 101,
          'id': '43502da40903d08b18b533f8897330badd6e1da3',
          'key': '838d19644b3296cf32637bbdf9ae5c87db34842f',
         'id': '8a6f5281317d8a8fb695d12c940b0ff7a7dee435',
          'key': 'MD5E-s8--4d87586dfb83dc4a5d15c6cfa6f61e27',
          'urls': [], 'annexed': True},
-        {'filename': 'sub-01/anat/sub-01_T1w.nii.gz', 'size': 19,
-         'id': '7fa0e07afaec0ff2cdf1bfc783596b4472df9b12',
+        {'id': '2f8451ae1016f936999aaacc0b3d79fb284ac3ea', 'filename': 'sub-01',
+         'directory': True, 'annexed': False, 'size': 0, 'urls': []}
+    ]:
+        assert f in root_content['files']
+    # Test sub-01 directory
+    sub_response = client.simulate_get(
+        '/datasets/{}/tree/{}'.format(ds_id, next(
+            (f['id'] for f in root_content['files'] if f['filename'] == 'sub-01'), None)))
+    assert sub_response.status == falcon.HTTP_OK
+    sub_content = json.loads(sub_response.content)
+    # Test sub-01/anat directory
+    anat_response = client.simulate_get(
+        '/datasets/{}/tree/{}'.format(ds_id, next(
+            (f['id'] for f in sub_content['files'] if f['filename'] == 'anat'), None)))
+    assert anat_response.status == falcon.HTTP_OK
+    anat_content = json.loads(anat_response.content)
+    # Test sub-01/anat/sub-01_T1w.nii.gz file
+    assert {'filename': 'sub-01/anat/sub-01_T1w.nii.gz', 'size': 19,
+            'id': 'e497096a2bce0d48b2761dade2b5c4e5a0f352bd',
             'key': 'MD5E-s19--8149926e49b677a5ccecf1ad565acccf.nii.gz',
-         'urls': [], 'annexed': True}
-    ])
+            'urls': [], 'annexed': True} in anat_content['files']
 
 
 def test_empty_file(client, new_dataset):
@@ -155,7 +169,7 @@ def test_empty_file(client, new_dataset):
         '/datasets/{}/draft'.format(ds_id), params={"validate": "false"})
     assert response.status == falcon.HTTP_OK
     # Get the files in the committed tree
-    response = client.simulate_get('/datasets/{}/files'.format(ds_id))
+    response = client.simulate_get('/datasets/{}/tree/HEAD'.format(ds_id))
     assert response.status == falcon.HTTP_OK
     response_content = json.loads(response.content)
     # Check that all elements exist in both lists
@@ -181,14 +195,20 @@ def test_duplicate_file_id(client, new_dataset):
     response = client.simulate_post(
         '/datasets/{}/draft'.format(ds_id), params={"validate": "false"})
     assert response.status == falcon.HTTP_OK
-    response = client.simulate_get('/datasets/{}/files'.format(ds_id))
+    response = client.simulate_get('/datasets/{}/tree/HEAD'.format(ds_id))
+    assert response.status == falcon.HTTP_OK
+    response_content = json.loads(response.content)
+    derivatives_tree = next((f['id'] for f in response_content['files']
+                             if f['filename'] == 'derivatives'), None)
+    response = client.simulate_get(
+        '/datasets/{}/tree/{}'.format(ds_id, derivatives_tree))
     assert response.status == falcon.HTTP_OK
     response_content = json.loads(response.content)
     # Find each file in the results
     file_one = next(
-        (f for f in response_content['files'] if f['filename'] == 'derivatives/one.json'), None)
+        (f for f in response_content['files'] if f['filename'] == 'one.json'), None)
     file_two = next(
-        (f for f in response_content['files'] if f['filename'] == 'derivatives/two.json'), None)
+        (f for f in response_content['files'] if f['filename'] == 'two.json'), None)
     # Validate they have differing ids
     assert file_one['id'] != file_two['id']
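
[Editor's note: a minimal sketch of how a client might walk the new per-tree API
recursively, as exercised by the tests above. The worker base URL and dataset ID
are hypothetical; the /datasets/{dataset}/tree/{tree} route and the 'directory'
and 'id' fields follow the handler and annex.py changes in this patch.]

import requests

WORKER = 'http://worker:80'  # hypothetical base URL


def walk_tree(dataset, tree='HEAD', prefix=''):
    """Yield (path, file) pairs, fetching one git tree object per request."""
    files = requests.get(
        '{}/datasets/{}/tree/{}'.format(WORKER, dataset, tree)).json()['files']
    for f in files:
        path = prefix + f['filename']
        if f.get('directory'):
            # Recurse using the tree object id returned for the directory
            yield from walk_tree(dataset, f['id'], path + '/')
        else:
            yield path, f


for path, f in walk_tree('ds000001'):  # hypothetical dataset id
    print(path, f['size'])
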
From b4a3cd3222c456410f24f11865c0df6292e18623 Mon Sep 17 00:00:00 2001
From: Nell Hardcastle
Date: Thu, 29 Sep 2022 16:31:06 -0700
Subject: [PATCH 02/34] refactor(server): Use a git tree object approach for
 file recursion

---
 packages/openneuro-server/src/datalad/files.js  | 15 ++++++---------
 .../openneuro-server/src/datalad/snapshots.js   |  5 +++--
 .../src/graphql/resolvers/draft.js              | 11 +++++------
 .../src/graphql/resolvers/snapshots.js          | 10 +++++-----
 packages/openneuro-server/src/graphql/schema.js |  5 ++---
 5 files changed, 21 insertions(+), 25 deletions(-)

diff --git a/packages/openneuro-server/src/datalad/files.js b/packages/openneuro-server/src/datalad/files.js
index 3256638970..007f0daa95 100644
--- a/packages/openneuro-server/src/datalad/files.js
+++ b/packages/openneuro-server/src/datalad/files.js
@@ -61,19 +61,17 @@ export const computeTotalSize = files =>
  * Get files for a specific revision
  * Similar to getDraftFiles but different cache key and fixed revisions
  * @param {string} datasetId - Dataset accession number
- * @param {string} hexsha - Git treeish hexsha
+ * @param {string} treeish - Git treeish hexsha
  */
-export const getFiles = (datasetId, hexsha) => {
+export const getFiles = (datasetId, treeish) => {
   const cache = new CacheItem(redis, CacheType.commitFiles, [
     datasetId,
-    hexsha.substring(0, 7),
+    treeish.substring(0, 7),
   ])
   return cache.get(() =>
    request
       .get(
-        `${getDatasetWorker(
-          datasetId,
-        )}/datasets/${datasetId}/snapshots/${hexsha}/files`,
+        `${getDatasetWorker(datasetId)}/datasets/${datasetId}/tree/${treeish}`,
       )
       .set('Accept', 'application/json')
       .then(response => {
         if (response.status === 200) {
           const {
             body: { files },
           } = response
-          const size = computeTotalSize(files)
           return {
-            files: files.map(addFileUrl(datasetId, hexsha)),
-            size,
+            files: files.map(addFileUrl(datasetId, treeish)),
+            size: 128,
           }
         }
       }),
diff --git a/packages/openneuro-server/src/datalad/snapshots.js b/packages/openneuro-server/src/datalad/snapshots.js
index 23146d4210..d381900b71 100644
--- a/packages/openneuro-server/src/datalad/snapshots.js
+++ b/packages/openneuro-server/src/datalad/snapshots.js
@@ -155,9 +155,10 @@ export const createSnapshot = async (
     snapshotChanges,
   )
   snapshot.created = new Date()
-  const { files, size } = await getFiles(datasetId, tag)
+  const { files } = await getFiles(datasetId, tag)
   snapshot.files = files
-  snapshot.size = size
+  // TODO - Use a different path for size
+  snapshot.size = 128
   await Promise.all([
     // Update the draft status in datasets collection in case any changes were made (DOI, License)
diff --git a/packages/openneuro-server/src/graphql/resolvers/draft.js b/packages/openneuro-server/src/graphql/resolvers/draft.js
index 2d185446c7..49928a2371 100644
--- a/packages/openneuro-server/src/graphql/resolvers/draft.js
+++ b/packages/openneuro-server/src/graphql/resolvers/draft.js
@@ -4,21 +4,20 @@ import { description } from './description.js'
 import { readme } from './readme.js'
 import { getDraftRevision, updateDatasetRevision } from '../../datalad/draft.js'
 import { checkDatasetWrite } from '../permissions.js'
-import { getFiles, filterFiles } from '../../datalad/files.js'
+import { getFiles } from '../../datalad/files.js'
 import { filterRemovedAnnexObjects } from '../utils/file.js'
 
 // A draft must have a dataset parent
 const draftFiles = async (dataset, args, { userInfo }) => {
   const hexsha = await getDraftRevision(dataset.id)
-  const { files } = await getFiles(dataset.id, hexsha)
-  const prefixFiltered = filterFiles('prefix' in args && args.prefix)(files)
-  return filterRemovedAnnexObjects(dataset.id, userInfo)(prefixFiltered)
+  const { files } = await getFiles(dataset.id, args.tree || hexsha)
+  return filterRemovedAnnexObjects(dataset.id, userInfo)(files)
 }
 
 const draftSize = async (dataset, args, { userInfo }) => {
   const hexsha = await getDraftRevision(dataset.id)
-  const { size } = await getFiles(dataset.id, hexsha)
-  return size
+  // TODO - Implement a different method for size
+  return 128
 }
 
 /**
diff --git a/packages/openneuro-server/src/graphql/resolvers/snapshots.js b/packages/openneuro-server/src/graphql/resolvers/snapshots.js
index 67892c996b..ebc13295b1 100644
--- a/packages/openneuro-server/src/graphql/resolvers/snapshots.js
+++ b/packages/openneuro-server/src/graphql/resolvers/snapshots.js
@@ -6,7 +6,7 @@ import { readme } from './readme.js'
 import { description } from './description.js'
 import { summary } from './summary.js'
 import { snapshotIssues } from './issues.js'
-import { getFiles, filterFiles } from '../../datalad/files.js'
+import { getFiles } from '../../datalad/files.js'
 import DatasetModel from '../../models/dataset'
 import { filterRemovedAnnexObjects } from '../utils/file.js'
 import DeprecatedSnapshot from '../../models/deprecatedSnapshot'
@@ -28,11 +28,11 @@ export const snapshot = (obj, { datasetId, tag }, context) => {
     description: () => description(snapshot),
     readme: () => readme(snapshot),
     summary: () => summary({ id: datasetId, revision: snapshot.hexsha }),
-    files: ({ prefix }) =>
-      getFiles(datasetId, snapshot.hexsha)
+    files: ({ tree }) =>
+      getFiles(datasetId, tree || snapshot.hexsha)
         .then(response => response.files)
-        .then(filterFiles(prefix))
-        .then(filterRemovedAnnexObjects(datasetId, context.userInfo)),
+        .then(filterRemovedAnnexObjects(datasetId, context.userInfo)),
     size: () =>
       getFiles(datasetId, snapshot.hexsha).then(response => response.size),
     deprecated: () => deprecated({ datasetId, tag }),
diff --git a/packages/openneuro-server/src/graphql/schema.js b/packages/openneuro-server/src/graphql/schema.js
index 86316914a4..e316c8db9d 100644
--- a/packages/openneuro-server/src/graphql/schema.js
+++ b/packages/openneuro-server/src/graphql/schema.js
@@ -434,7 +434,7 @@ export const typeDefs = `
     # Validator issues
     issues: [ValidationIssue]
     # Committed files in the working tree
-    files(prefix: String = ""): [DatasetFile]
+    files(tree: String): [DatasetFile]
     # dataset_description.json fields
     description: Description
     # Dataset README
@@ -461,7 +461,7 @@
     # bids-validator issues for this snapshot
     issues: [ValidationIssue]
     # Snapshot files
-    files(prefix: String = ""): [DatasetFile]
+    files(tree: String): [DatasetFile]
     # dataset_description.json fields
     description: Description
     # Snapshot usage and download statistics
@@ -689,7 +689,6 @@
     size: BigInt
     annexed: Boolean
     urls: [String]
-    objectpath: String
     # Return a flag if this is a directory which contains more files
     directory: Boolean
   }
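
[Editor's note: a minimal sketch of paging through directories with the new
`files(tree: String)` argument added to the schema above. The GraphQL endpoint
URL, dataset ID, and tag are hypothetical; the query shape follows this patch.]

import requests

GRAPHQL = 'https://openneuro.example/crn/graphql'  # hypothetical endpoint

QUERY = '''
query snapshotFiles($datasetId: ID!, $tag: String!, $tree: String) {
  snapshot(datasetId: $datasetId, tag: $tag) {
    files(tree: $tree) {
      id
      filename
      size
      directory
    }
  }
}
'''


def list_directory(dataset_id, tag, tree=None):
    """Fetch one level of the file tree; pass a directory's id as tree to descend."""
    resp = requests.post(GRAPHQL, json={
        'query': QUERY,
        'variables': {'datasetId': dataset_id, 'tag': tag, 'tree': tree},
    })
    return resp.json()['data']['snapshot']['files']


# Root listing first, then one request per directory as it is expanded
root = list_directory('ds000001', '1.0.0')
for f in root:
    if f['directory']:
        children = list_directory('ds000001', '1.0.0', tree=f['id'])
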
From f4a3aa77603acd6ef3144b4ff09642ad1a60c264 Mon Sep 17 00:00:00 2001
From: Nell Hardcastle
Date: Thu, 29 Sep 2022 16:31:30 -0700
Subject: [PATCH 03/34] refactor(app): Adopt git tree loading for dataset page
 file tree

---
 .../files/__tests__/flat-to-tree.spec.js      |  55 ---------
 .../files/file-tree-unloaded-directory.jsx    |  44 ++++---
 .../src/scripts/dataset/files/file-tree.jsx   | 110 +++++++++++-------
 .../src/scripts/dataset/files/files.jsx       |   8 +-
 .../src/scripts/dataset/files/flat-to-tree.js |  49 --------
 5 files changed, 94 insertions(+), 172 deletions(-)
 delete mode 100644 packages/openneuro-app/src/scripts/dataset/files/__tests__/flat-to-tree.spec.js
 delete mode 100644 packages/openneuro-app/src/scripts/dataset/files/flat-to-tree.js

diff --git a/packages/openneuro-app/src/scripts/dataset/files/__tests__/flat-to-tree.spec.js b/packages/openneuro-app/src/scripts/dataset/files/__tests__/flat-to-tree.spec.js
deleted file mode 100644
index 8ca64c93f8..0000000000
--- a/packages/openneuro-app/src/scripts/dataset/files/__tests__/flat-to-tree.spec.js
+++ /dev/null
@@ -1,55 +0,0 @@
-import { flatToTree } from '../flat-to-tree.js'
-
-const CHANGES = Object.freeze({
-  id: '3d9b15b3ef4e9da06e265e6078d3b4ddf8495102',
-  filename: 'CHANGES',
-  size: 39,
-})
-
-const nifti = Object.freeze({
-  id: '50512c7261fc006eb59bfd16f2a9d3140c9efe62',
-  filename: 'sub-01/anat/sub-01_T1w.nii.gz',
-  size: 311112,
-})
-
-const sub01Unloaded = Object.freeze({
-  id: 'directory:sub-01',
-  filename: 'sub-01',
-  size: 1,
-  directory: true,
-})
-
-const exampleFiles = [CHANGES, nifti]
-
-describe('FileTree', () => {
-  describe('flatToTree()', () => {
-    it('accepts an array and returns a tree', () => {
-      expect(flatToTree(exampleFiles)).toEqual({
-        name: '',
-        files: [CHANGES],
-        directories: [
-          {
-            name: 'sub-01',
-            path: 'sub-01',
-            files: [],
-            directories: [
-              {
-                name: 'anat',
-                path: 'sub-01:anat',
-                files: [{ ...nifti, filename: 'sub-01_T1w.nii.gz' }],
-                directories: [],
-              },
-            ],
-          },
-        ],
-      })
-    })
-    it('accepts directory stubs and returns them as directories', () => {
-      expect(flatToTree([CHANGES, sub01Unloaded])).toEqual({
-        name: '',
-        files: [CHANGES],
-        directories: [{ ...sub01Unloaded, name: sub01Unloaded.filename }],
-      })
-    })
-  })
-})
diff --git a/packages/openneuro-app/src/scripts/dataset/files/file-tree-unloaded-directory.jsx b/packages/openneuro-app/src/scripts/dataset/files/file-tree-unloaded-directory.jsx
index 4d85da1408..cbf0918d9c 100644
--- a/packages/openneuro-app/src/scripts/dataset/files/file-tree-unloaded-directory.jsx
+++ b/packages/openneuro-app/src/scripts/dataset/files/file-tree-unloaded-directory.jsx
@@ -6,10 +6,10 @@ import { gql } from '@apollo/client'
 import { AccordionTab } from '@openneuro/components/accordion'
 
 export const DRAFT_FILES_QUERY = gql`
-  query dataset($datasetId: ID!, $filePrefix: String!) {
+  query dataset($datasetId: ID!, $tree: String!) {
     dataset(id: $datasetId) {
       draft {
-        files(prefix: $filePrefix) {
+        files(tree: $tree) {
           id
           key
           filename
@@ -23,9 +23,9 @@
 `
 
 export const SNAPSHOT_FILES_QUERY = gql`
-  query snapshot($datasetId: ID!, $snapshotTag: String!, $filePrefix: String!) {
+  query snapshot($datasetId: ID!, $snapshotTag: String!, $tree: String!) {
     snapshot(datasetId: $datasetId, tag: $snapshotTag) {
-      files(prefix: $filePrefix) {
+      files(tree: $tree) {
         id
         key
         filename
@@ -37,24 +37,30 @@
   }
 `
 
+/**
+ * Prepend paths to the tree object returned to get absolute filenames
+ */
+export const nestFiles = path => file => ({
+  ...file,
+  filename: `${path}:${file.filename}`,
+})
+
+/**
+ * Merge cached dataset files with newly received data
+ */
 export const mergeNewFiles =
   (directory, snapshotTag) =>
   (past, { fetchMoreResult }) => {
     // Deep clone the old dataset object
+    const path = directory.filename
     const newDatasetObj = JSON.parse(JSON.stringify(past))
-    const mergeNewFileFilter = f => f.id !== directory.id
-    // Remove ourselves from the array
-    if (snapshotTag) {
-      newDatasetObj.snapshot.files =
-        newDatasetObj.snapshot.files.filter(mergeNewFileFilter)
-      newDatasetObj.snapshot.files.push(...fetchMoreResult.snapshot.files)
-    } else {
-      newDatasetObj.dataset.draft.files =
-        newDatasetObj.dataset.draft.files.filter(mergeNewFileFilter)
-      newDatasetObj.dataset.draft.files.push(
-        ...fetchMoreResult.dataset.draft.files,
-      )
-    }
+    const newFiles = snapshotTag
+      ? newDatasetObj.snapshot.files
+      : newDatasetObj.dataset.draft.files
+    const fetchMoreData = snapshotTag
+      ? fetchMoreResult.snapshot
+      : fetchMoreResult.dataset.draft
+    newFiles.push(...fetchMoreData.files.map(nestFiles(path)))
     return newDatasetObj
   }
 
@@ -66,7 +72,7 @@ export const fetchMoreDirectory = (
 ) =>
   fetchMore({
     query: snapshotTag ? SNAPSHOT_FILES_QUERY : DRAFT_FILES_QUERY,
-    variables: { datasetId, snapshotTag, filePrefix: directory.filename + '/' },
+    variables: { datasetId, snapshotTag, tree: directory.id },
    updateQuery: mergeNewFiles(directory, snapshotTag),
   })
 
@@ -82,7 +88,7 @@ const FileTreeUnloadedDirectory = ({ datasetId, snapshotTag, directory }) => {
   }, [loading])
   return (
     {
       // Show a loading state while we wait on the directory to stream in
diff --git a/packages/openneuro-app/src/scripts/dataset/files/file-tree.jsx b/packages/openneuro-app/src/scripts/dataset/files/file-tree.jsx
index c6d62f5bc0..3d8cdbd4cc 100644
--- a/packages/openneuro-app/src/scripts/dataset/files/file-tree.jsx
+++ b/packages/openneuro-app/src/scripts/dataset/files/file-tree.jsx
@@ -7,21 +7,14 @@ import FileTreeUnloadedDirectory from './file-tree-unloaded-directory.jsx'
 import { Media } from '../../styles/media'
 import { AccordionTab } from '@openneuro/components/accordion'
 
-export const sortByFilename = (a, b) => a.filename.localeCompare(b.filename)
-
-export const sortByName = (a, b) => a.name.localeCompare(b.name)
-
 export const unescapePath = path => path.replace(/:/g, '/')
 
-const isTopLevel = dir => !dir.path.includes(':')
-
 const FileTree = ({
   datasetId,
   snapshotTag = null,
   path = '',
   name = '',
   files = [],
-  directories = [],
   editMode = false,
   defaultExpanded = false,
   datasetPermissions,
@@ -29,10 +22,33 @@ const FileTree = ({
   isFileToBeDeleted,
   bulkDeleteButton,
 }) => {
+  // Split files into a tree for this level and child levels
+  // Special cases for root (path === '')
+  const currentFiles = []
+  const childFiles = {}
+  for (const f of files) {
+    // Any paths in this filename below the current path value
+    const lowerPath = f.filename.substring(`${path}:`.length)
+    if (path === '' ? f.filename.includes(':') : lowerPath.includes(':')) {
+      // At the top level, use the directory component (first segment)
+      // Below that, use all segments before the filename (sub-01:anat for sub-01:anat:sub-01_T1w.nii.gz)
+      const childPath =
+        path === ''
+          ? f.filename.split(':')[0]
+          : f.filename.split(':').slice(0, -1).join(':')
+      if (childFiles.hasOwnProperty(childPath)) {
+        childFiles[childPath].push(f)
+      } else {
+        childFiles[childPath] = [f]
+      }
+    } else {
+      currentFiles.push(f)
+    }
+  }
   return (
     
       {editMode && (
@@ -59,48 +75,50 @@
         )}
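
[Editor's note: a minimal sketch, in Python for brevity, of the per-level
grouping that file-tree.jsx performs above. Filenames arrive colon-delimited
(e.g. 'sub-01:anat:sub-01_T1w.nii.gz'); each render level keeps its own files
and buckets deeper paths by child directory. The sample data is hypothetical.]

def split_level(files, path=''):
    """Return (files at this level, child files keyed by child directory path)."""
    current, children = [], {}
    for f in files:
        name = f['filename']
        # Portion of the path below the current level
        lower = name[len(path) + 1:] if path else name
        if ':' in lower:
            # Top level buckets by first segment; deeper levels bucket by
            # everything before the final filename segment
            child = name.split(':')[0] if not path else ':'.join(
                name.split(':')[:-1])
            children.setdefault(child, []).append(f)
        else:
            current.append(f)
    return current, children


files = [{'filename': 'CHANGES'}, {'filename': 'sub-01:anat:sub-01_T1w.nii.gz'}]
current, children = split_level(files)
# current -> [{'filename': 'CHANGES'}]
# children -> {'sub-01': [{'filename': 'sub-01:anat:sub-01_T1w.nii.gz'}]}
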