Skip to content

Commit

Permalink
one_offline prototype
Browse files Browse the repository at this point in the history
  • Loading branch information
oliche committed Jul 31, 2020
1 parent 5ed59a9 commit 40dad3b
Show file tree
Hide file tree
Showing 12 changed files with 354 additions and 135 deletions.
25 changes: 0 additions & 25 deletions brainbox/brainbox_env.yml

This file was deleted.

5 changes: 2 additions & 3 deletions brainbox/core/core.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
'''
"""
Creates core data types and functions which support all of brainbox.
'''
"""
from pathlib import Path

import numpy as np
import pandas as pd

Expand Down
99 changes: 98 additions & 1 deletion brainbox/io/parquet.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,31 @@
import numpy as np
import uuid
import numpy as np

from numba import jit
import pyarrow.parquet as pq
import pyarrow as pa
import pandas as pd

from brainbox.core import Bunch


def load(file):
    """
    Read a parquet file from disk into a pandas dataframe.

    :param file: path to the parquet file
    :return: pandas.DataFrame with the file contents
    """
    table = pq.read_table(file)
    return table.to_pandas()


def save(file, table):
    """
    Write a pandas dataframe to disk as a parquet file.

    :param file: destination path for the parquet file
    :param table: pandas.DataFrame to serialize
    :return: None
    """
    arrow_table = pa.Table.from_pandas(table)
    pq.write_table(arrow_table, file)


def uuid2np(eids_uuid):
Expand All @@ -8,12 +34,83 @@ def uuid2np(eids_uuid):


def str2np(eids_str):
    """
    Converts uuid string(s) to a packed int64 numpy representation (via uuid2np).

    :param eids_str: a single uuid string or an iterable of uuid strings
    :return: numpy int64 array as produced by uuid2np
    """
    # promote a lone string to a one-element list so the comprehension below works
    eids = [eids_str] if isinstance(eids_str, str) else eids_str
    return uuid2np([uuid.UUID(eid) for eid in eids])


def np2uuid(eids_np):
    """
    Converts packed int64 rows back into uuid.UUID objects.

    :param eids_np: numpy array (or pandas DataFrame) where each row holds the
     16 raw bytes of one uuid
    :return: list of uuid.UUID
    """
    if isinstance(eids_np, pd.DataFrame):
        eids_np = eids_np.to_numpy()
    out = []
    for row in eids_np:
        # each row's raw bytes reconstruct one 128-bit uuid
        out.append(uuid.UUID(bytes=row.tobytes()))
    return out


def np2str(eids_np):
    """
    Converts packed int64 uuid rows to their canonical string form.

    :param eids_np: numpy array or pandas DataFrame (same input as np2uuid)
    :return: list of uuid strings
    """
    return list(map(str, np2uuid(eids_np)))


def rec2col(rec, join=None, include=None, exclude=None, uuid_fields=None):
    """
    Change a record list (usually from a REST API endpoint) to a column based dictionary
    (pandas dataframe).

    :param rec: list of dictionaries with consistent keys (a single dict is also accepted)
    :param join: dictionary of scalar keys that will be replicated over the full
     array (join operation)
    :param include: list of strings representing dictionary keys: if specified will only include
     the keys specified here
    :param exclude: list of strings representing dictionary keys: if specified will exclude the
     keys specified here
    :param uuid_fields: if the field is a UUID, will split it into 2 distinct int64 columns for
     efficient lookups and intersections
    :return: a Bunch
    """
    if isinstance(rec, dict):
        rec = [rec]
    if len(rec) == 0:
        return Bunch()
    if include is None:
        # bug fix: `rec` is a list at this point, so keys must come from the first
        # record (`rec.keys()` raised AttributeError whenever include was omitted)
        include = rec[0].keys()
    if exclude is None:
        exclude = []
    if uuid_fields is None:
        uuid_fields = []
    if join is None:
        join = {}

    # first loop over the records and create each column as a numpy array
    nrecs = len(rec)
    col = {}
    keys = [k for k in rec[0] if k in include and k not in exclude]
    for key in keys:
        if key in uuid_fields:
            # split each 128-bit uuid into two int64 columns for fast lookups
            npuuid = str2np(np.array([c[key] for c in rec]))
            col[f"{key}_0"] = npuuid[:, 0]
            col[f"{key}_1"] = npuuid[:, 1]
        else:
            col[key] = np.array([c[key] for c in rec])

    # then perform the joins if any: each scalar is tiled to the number of records
    for key in join:
        if key in uuid_fields:
            npuuid = str2np([join[key]])
            col[f"{key}_0"] = np.tile(npuuid[0, 0], (nrecs,))
            col[f"{key}_1"] = np.tile(npuuid[0, 1], (nrecs,))
        else:
            col[key] = np.tile(np.array(join[key]), (nrecs,))

    return Bunch(col)


@jit(nopython=True)
def find_first_2d(mat, val):
    """
    Returns the index of the first row of `mat` that equals `val` element-wise.
    The purpose of this function is performance: uses low level numba and avoids looping
    through the full array once a match is found
    :param mat: np.array (2d)
    :param val: row of values to search for, compared against each row of `mat`
    :return: int index of the first matching row, or None when no row matches
     (the loop falls through without returning)
    """
    for i in np.arange(mat.shape[0]):
        if np.all(mat[i] == val):
            return i
File renamed without changes.
1 change: 1 addition & 0 deletions brainbox/tests/fixtures/parquet_records.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[{"id": "490dbe0e-8d96-44ef-bb3f-3443285376e4", "name": "_ibl_trials.intervals.npy", "dataset_type": "trials.intervals", "data_url": "http://ibl.flatironinstitute.org/churchlandlab/Subjects/CSHL046/2020-06-20/002/alf/_ibl_trials.intervals.490dbe0e-8d96-44ef-bb3f-3443285376e4.npy", "url": "https://alyx.internationalbrainlab.org/datasets/490dbe0e-8d96-44ef-bb3f-3443285376e4", "file_size": 5824, "hash": "a9d372ce849439c12243dd4c1bbf29d5", "version": "1.4.14", "collection": "alf"}, {"id": "6b6aeba4-a6c2-4a42-aa8c-b2f12e790623", "name": "_ibl_log.info.register_v1.4.11.log", "dataset_type": "_ibl_log.info", "data_url": "http://ibl.flatironinstitute.org/churchlandlab/Subjects/CSHL046/2020-06-20/002/logs/_ibl_log.info.register_v1.4.11.6b6aeba4-a6c2-4a42-aa8c-b2f12e790623.log", "url": "https://alyx.internationalbrainlab.org/datasets/6b6aeba4-a6c2-4a42-aa8c-b2f12e790623", "file_size": 146, "hash": "fe2dd9fee9720049b02fe318165222bf", "version": "1.4.14", "collection": "logs"}, {"id": "1c633c8c-7b04-458f-b7a8-c1ac62ce9dcb", "name": "_iblrig_bodyCamera.raw.mp4", "dataset_type": "_iblrig_Camera.raw", "data_url": "http://ibl.flatironinstitute.org/churchlandlab/Subjects/CSHL046/2020-06-20/002/raw_video_data/_iblrig_bodyCamera.raw.1c633c8c-7b04-458f-b7a8-c1ac62ce9dcb.mp4", "url": "https://alyx.internationalbrainlab.org/datasets/1c633c8c-7b04-458f-b7a8-c1ac62ce9dcb", "file_size": 741255803, "hash": "c8e605c3112c639ac7fd7a1a1e1e782e", "version": "1.4.14", "collection": "raw_video_data"}, {"id": "88ca7cad-6bba-49f3-af69-1056a4099474", "name": "_spikeglx_ephysData_g1_t0.imec.lf.cbin", "dataset_type": "ephysData.raw.lf", "data_url": "http://ibl.flatironinstitute.org/churchlandlab/Subjects/CSHL046/2020-06-20/002/raw_ephys_data/probe01/_spikeglx_ephysData_g1_t0.imec.lf.88ca7cad-6bba-49f3-af69-1056a4099474.cbin", "url": "https://alyx.internationalbrainlab.org/datasets/88ca7cad-6bba-49f3-af69-1056a4099474", "file_size": 3276657550, "hash": null, "version": "1.4.14", "collection": 
"raw_ephys_data/probe01"}, {"id": "3e7f3a3d-5992-4d3e-86b2-69016464beae", "name": "_spikeglx_sync.polarities.probe00.npy", "dataset_type": "_spikeglx_sync.polarities", "data_url": "http://ibl.flatironinstitute.org/churchlandlab/Subjects/CSHL046/2020-06-20/002/raw_ephys_data/probe00/_spikeglx_sync.polarities.probe00.3e7f3a3d-5992-4d3e-86b2-69016464beae.npy", "url": "https://alyx.internationalbrainlab.org/datasets/3e7f3a3d-5992-4d3e-86b2-69016464beae", "file_size": 20475896, "hash": "22ee1f879df4c0705474586b64412ea3", "version": "1.4.14", "collection": "raw_ephys_data/probe00"}]
23 changes: 22 additions & 1 deletion brainbox/tests/test_io.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,35 @@
import json
from pathlib import Path
import unittest
import uuid

import numpy as np

from brainbox.core import intersect2d, ismember2d, ismember
from brainbox.io.parquet import uuid2np, np2uuid
from brainbox.io.parquet import uuid2np, np2uuid, rec2col


class TestParquet(unittest.TestCase):

def test_rec2col(self):
json_fixture = Path(__file__).parent.joinpath('fixtures', 'parquet_records.json')
with open(json_fixture, 'r') as fid:
datasets = json.loads(fid.read())
# test with includes / joins and uuid fields in both join and includes
include = ['id', 'hash', 'dataset_type', 'name', 'file_size', 'collection']
uuid_fields = ['id', 'eid']
join = {'subject': 'Bernard', 'lab': 'thelab',
'eid': '150f92bc-e755-4f54-96c1-84e1eaf832b4'}
arr = rec2col(datasets, include=include, uuid_fields=uuid_fields, join=join)
self.assertTrue(np.all(np.array([arr[k].size for k in arr]) == 5))
self.assertTrue(len(arr.keys()) == len(include) + len(uuid_fields) + len(join.keys()))
# test single dictionary
arr_single = rec2col(datasets[0], include=include, uuid_fields=uuid_fields, join=join)
self.assertTrue(np.all(arr.to_df().iloc[0] == arr_single.to_df()))
# test empty
arr_empty = rec2col([], include=include, uuid_fields=uuid_fields, join=join)
self.assertTrue(arr_empty.to_df().size == 0)

def test_uuids_intersections(self):
ntotal = 500
nsub = 17
Expand Down
2 changes: 1 addition & 1 deletion brainbox/tests/test_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ class TestTask(unittest.TestCase):

def setUp(self):
# Test data is a dictionary of spike times and clusters and event times and groups
pickle_file = Path(__file__).parent.joinpath('ephys_test.p')
pickle_file = Path(__file__).parent.joinpath('fixtures', 'ephys_test.p')
if not pickle_file.exists():
self.test_data = None
else:
Expand Down
18 changes: 10 additions & 8 deletions ibllib/io/extractors/ephys_fpga.py
Original file line number Diff line number Diff line change
Expand Up @@ -417,14 +417,10 @@ def extract_behaviour_sync(sync, chmap=None, display=False, tmax=np.inf):
else:
ax = display
r0 = _get_sync_fronts(sync, chmap['rotary_encoder_0'])
plots.squares(bpod['times'], bpod['polarities'] * 0.4 + 1,
ax=ax, color='k')
plots.squares(frame2ttl['times'], frame2ttl['polarities'] * 0.4 + 2,
ax=ax, color='k')
plots.squares(audio['times'], audio['polarities'] * 0.4 + 3,
ax=ax, color='k')
plots.squares(r0['times'], r0['polarities'] * 0.4 + 4,
ax=ax, color='k')
plots.squares(bpod['times'], bpod['polarities'] * 0.4 + 1, ax=ax, color='k')
plots.squares(frame2ttl['times'], frame2ttl['polarities'] * 0.4 + 2, ax=ax, color='k')
plots.squares(audio['times'], audio['polarities'] * 0.4 + 3, ax=ax, color='k')
plots.squares(r0['times'], r0['polarities'] * 0.4 + 4, ax=ax, color='k')
plots.vertical_lines(t_ready_tone_in, ymin=0, ymax=ymax,
ax=ax, label='goCue_times', color='b', linewidth=width)
plots.vertical_lines(t_trial_start, ymin=0, ymax=ymax,
Expand All @@ -439,6 +435,12 @@ def extract_behaviour_sync(sync, chmap=None, display=False, tmax=np.inf):
ax=ax, label='stim off', color='c', linewidth=width)
plots.vertical_lines(trials['stimOn_times'], ymin=0, ymax=ymax,
ax=ax, label='stimOn_times', color='tab:orange', linewidth=width)
c = _get_sync_fronts(sync, chmap['left_camera'])
plots.squares(c['times'], c['polarities'] * 0.4 + 5, ax=ax, color='k')
c = _get_sync_fronts(sync, chmap['right_camera'])
plots.squares(c['times'], c['polarities'] * 0.4 + 6, ax=ax, color='k')
c = _get_sync_fronts(sync, chmap['body_camera'])
plots.squares(c['times'], c['polarities'] * 0.4 + 7, ax=ax, color='k')
ax.legend()
ax.set_yticklabels(['', 'bpod', 'f2ttl', 'audio', 're_0', ''])
ax.set_yticks([0, 1, 2, 3, 4, 5])
Expand Down
16 changes: 0 additions & 16 deletions ibllib/io/one.py

This file was deleted.

2 changes: 1 addition & 1 deletion ibllib/qc/oneutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

log = logging.getLogger("ibllib")

one = ONE(printout=False)
one = ONE()


def download_bpodqc_raw_data(eid, one=None):
Expand Down
22 changes: 20 additions & 2 deletions oneibl/dataclass.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from dataclasses import dataclass, field, fields
from pathlib import Path

from brainbox.io import parquet
from ibllib.misc import flatten


Expand Down Expand Up @@ -64,9 +65,9 @@ def from_datasets(dsets, dataset_types=None, eid=None):
dataset_type=[d['dataset_type'] for d in dsets],
dataset_id=[d['id'] for d in dsets],
local_path=[None for d in dsets],
eid=[eid for d in dsets], # [ses_info['url'][-36:] for d in dsets],
eid=[eid for _ in dsets], # [ses_info['url'][-36:] for d in dsets],
url=[d['data_url'] for d in dsets],
data=[None for d in dsets],
data=[None for _ in dsets],
hash=[d['hash'] for d in dsets],
file_size=[d['file_size'] for d in dsets]
)
Expand All @@ -75,6 +76,23 @@ def from_datasets(dsets, dataset_types=None, eid=None):
def from_session_details(ses_info, **kwargs):
return _session_details_to_dataclasses(ses_info, **kwargs)

@staticmethod
def from_pandas(df, cache_dir):
fcn_local_path = lambda rec: Path(cache_dir).joinpath( # noqa
rec['lab'], 'Subjects', rec['subject'], rec['start_time'][:10],
str(rec['number']).zfill(3), rec['collection'], rec['name'])
nrecs = df.shape[0]
return SessionDataInfo(
dataset_type=df.dataset_type.to_list(),
dataset_id=list(parquet.np2str(df[['id_0', 'id_1']])),
local_path=df.apply(fcn_local_path, axis=1).to_list(),
eid=list(parquet.np2str(df[['eid_0', 'eid_1']])),
url=[None for _ in range(nrecs)],
data=[None for _ in range(nrecs)],
hash=df.hash.to_list(),
file_size=df.file_size.to_list()
)


@singledispatch
def _session_details_to_dataclasses(ses_info, **kwargs):
Expand Down
Loading

0 comments on commit 40dad3b

Please sign in to comment.