From 00c097de2e2c9ea1d97cd67cb4120bec45f16c5b Mon Sep 17 00:00:00 2001 From: Angelo Romano Date: Fri, 24 Jan 2025 16:28:25 +0100 Subject: [PATCH 1/5] [CADL-41] Integration tests --- py311/fixture_files/sample-1-expected.csv | 4 + py311/test_dataset.py | 1284 +++++++++++++++++++++ 2 files changed, 1288 insertions(+) create mode 100644 py311/fixture_files/sample-1-expected.csv create mode 100644 py311/test_dataset.py diff --git a/py311/fixture_files/sample-1-expected.csv b/py311/fixture_files/sample-1-expected.csv new file mode 100644 index 0000000..05bdabd --- /dev/null +++ b/py311/fixture_files/sample-1-expected.csv @@ -0,0 +1,4 @@ +Q1,Q3,Q5,Q6,Q7,Q99,Q2_1,Q2_3,Q2_2,Q2_5,Q2_4,Q2_9,Q4_1,Q4_2,Q4_4,Q4_9,Q4_3,Q4_5 +2,Amusement Park,12.0,1,1,1.4,1,1,0,0,0,1,1,0,0,1,0,0 +3,,999.0,1,3,0.9,0,0,1,0,0,0,0,1,0,0,0,0 +2,"""Marco's"" Restaurant",58.0,0,,0.7,1,0,0,0,1,1,0,0,1,1,0,0 diff --git a/py311/test_dataset.py b/py311/test_dataset.py new file mode 100644 index 0000000..ed4e0c6 --- /dev/null +++ b/py311/test_dataset.py @@ -0,0 +1,1284 @@ +# coding: utf-8 +import codecs +import csv +import os +import tempfile +import uuid +from datetime import datetime +from typing import Any, Dict, Optional, Union + +import numpy +import pyspssio +from numpy.testing import assert_equal as numpy_assert_equal +from pycrunch.shoji import Entity, as_entity, wait_progress + +from integration.fixtures import BaseIntegrationTestCase +from scrunch.cubes import crtabs +from scrunch.datasets import Project +from scrunch.streaming_dataset import StreamingDataset + +PROJECT_ID = os.environ.get("SCRUNCH_PROJECT_ID") +PROJECT_311_ID = os.environ.get("SCRUNCH_PROJECT_311_ID") +TEST_DATASET_ID = os.environ.get("SCRUNCH_TEST_DATASET_ID") + + +class ST: + """Values the `.type` attribute of a Source object can take""" + + CSV = "csv" + JSON = "json" + LDJSON = "ldjson" + ZCL = "zcl" + PARQUET = "pqt" + SPSS = "spss" + SSS_METADATA = "sss-metadata" + SSS_DATA = "sss-data" + CRUNCH_METADATA = "crunch-metadata" + CSV_TASK_TEST = "csv_task" # for testing only until csv is moved over to tasks + SPSS_TASK_TEST = "spss_task" # for testing only until csv is moved over to tasks + + +# List of source filetypes from cr.server +source_filetypes = { + "application/x-ldjson": ST.LDJSON, + "application/x-spss-sav": ST.SPSS, + "text/csv": ST.CSV, + "text/json": ST.JSON, + "text/ldjson": ST.LDJSON, + "text/plain": ST.CSV, + "text/xml": ST.SSS_METADATA, + "application/x-crunch-metadata+json": ST.CRUNCH_METADATA, + "application/x-parquet": ST.PARQUET, + "application/x-ndjson": ST.ZCL, +} + +source_mimetypes = {} +for mimetype, val in source_filetypes.items(): + if val not in source_mimetypes: + source_mimetypes[val] = [] + source_mimetypes[val].append(mimetype) +source_mimetypes["txt"] = ["text/csv"] # Backward compatibility + + +def ensure_binary( + s: Union[str, bytes], encoding: str = "utf-8", errors: str = "strict" +) -> bytes: + """Coerce **s** to bytes. + - `str` -> encoded to `bytes` + - `bytes` -> `bytes` + + :param s: The contents to coerce. + :param encoding: Encoding type (default to UTF-8). + :param errors: Error handling level in encoding (default to strict). + """ + if isinstance(s, bytes): + return s + if isinstance(s, str): + return s.encode(encoding, errors) + raise TypeError(f"not expecting type '{type(s)}'") + + +BOUNDARY = "________ThIs_Is_tHe_bouNdaRY_$" + + +def encode_multipart_formdata(files): + """Return (content_type, body) ready for httplib.HTTP instance. 
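+
+    A minimal sketch of the returned header (the file tuple values below are
+    hypothetical):
+
+        >>> ct, _ = encode_multipart_formdata([("uploaded_file", "data.csv", "a,b")])
+        >>> ct
+        'multipart/form-data; charset=UTF-8; boundary=________ThIs_Is_tHe_bouNdaRY_$'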
+ + files: a sequence of (name, filename, value) tuples for multipart uploads. + """ + lines = [] + for key, filename, value in files: + lines.append("--" + BOUNDARY) + if filename is None: + lines.append(f'Content-Disposition: form-data; name="{key}"\r\n\r\n{value}') + continue + lines.append( + f'Content-Disposition: form-data; name="{key}"; filename="{filename}"' + ) + ct = source_mimetypes.get( + filename.rsplit(".", 1)[-1], ["application/octet-stream"] + )[0] + lines.append(f"Content-Type: {ct}") + lines.append("") + lines.append(value) + lines.append("--" + BOUNDARY + "--") + lines.append("") + body = "\r\n".join(lines) + content_type = f"multipart/form-data; charset=UTF-8; boundary={BOUNDARY}" + return content_type, body + + +def encode_formdata_item(content, content_type=None, **params): + body = "--%s\r\n" "Content-Disposition: form-data; %s\r\n" "%s" "\r\n" "%s\r\n" + params = "; ".join('%s="%s"' % (a, b) for a, b in params.items()) + + content_type = "Content-Type: %s\r\n" % content_type if content_type else "" + return body % (BOUNDARY, params, content_type, content) + + +class BaseTestCase(BaseIntegrationTestCase): + TEST_FUNCTIONS = [] + _created_datasets = None + weight = None + + def _test_file_bytes(self, filename): + """Return str (bytes) content of test file with *filename*. + + Test files are located in the `tests/files/` directory. + """ + file_path = os.path.join(filename) + with codecs.open(file_path, "rb", "latin1") as f: + contents = f.read() + + return contents.encode().decode("utf8") + + def _encode_file_as_multipart(self, field_name, filename, content_type, contents): + """Return (content_type, body) containing specified file encoded for upload. + + The returned *content_type* is the "multipart/form-data ..." content-type header + value for the request, including the boundary string used as a suffix. Note this + is *not* the same content-type as the file to be uploaded, which is specified as + a parameter. + + *field_name* is the form field name by which the file will be identified in the + HTTP request. *filename* should be the OS filename with extension (but no path) + such as "data.csv". *content_type* is the MIME-type of the file (distinct from + the `content_type` return value of this method. *contents* is a str (bytes, not + unicode) containing the content of the file. + """ + + body = encode_formdata_item( + contents, + content_type, + name=field_name, + filename=filename, + ) + body += "--%s--\r\n" % BOUNDARY + + content_type = f"multipart/form-data; charset=UTF-8; boundary={BOUNDARY}" + + return content_type, body + + def _parse_on_311(self, on_311: Union[None, bool]) -> bool: + """ + Based on the value of the parameters, returns True or False, based on whether we are + meant to run this on a Python 3.11 factory or not. + + This coincides to the value of `on_311` in case it is one of `True` or `False`. + When the value is `None`, the value corresponds to the current Python version bound with + the current class - i.e., `True` if `CURRENT_VERSION` is `3.11`, `False` otherwise. + """ + if on_311 is None: + return False if self.CURRENT_VERSION == "3.6" else True + return on_311 + + def _import_dataset( + self, + metadata: Dict[str, Any], + input_file: str, + on_311: Optional[bool] = None, + format_: str = "csv", + ): + """ + :param metadata: The metadata fields associated to the dataset we are creating. + :param input_file: The input file. + :param on_311: Whether to run the import under Python 3.11 or not. Default to `None` (same + setting as the class). 
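+        :param format_: The source format, one of "csv" or "spss" (defaults to
+            "csv").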
+ + """ + on_311 = self._parse_on_311(on_311) + input_fullpath = os.path.abspath( + os.path.join(".", "py311", "fixture_files", input_file) + ) + name = ( + "Weighed imported test dataset" if self.weight else "Imported test dataset" + ) + ds_data = {k: v for k, v in metadata.items()} + ds_data["name"] = ( + f"{name} {uuid.uuid4().hex[:16]} [{datetime.now().isoformat()}]" + ) + project_id = PROJECT_311_ID if on_311 else PROJECT_ID + if project_id: + ds_data["project"] = f"/projects/{project_id}/" + # server/tests/controllers/test_sources.py + # streaming dataset + # steps + # 1. HTTP POST /sources/ {"uploaded_file": Binary} + # -> response: HTTP 201 - headers {"Location": SourceURL} + # 2. HTTP POST /datasets/ {"body": {"name": Str}} + # -> response: HTTP 201 - headers {"Location": DatasetURL} + # 3. HTTP POST /datasets/DID/batches/ {"savepoint": False, "body": + # {"workflow": [], "source": SourceURL}} + # -> response: HTTP 202 (in progress) / headers {"Location": BatchURL} + # 4. HTTP GET BatchURL + # -> response: HTTP 200/202 / {"value": {"progress": N, "message": + # Str}, "views": {"result": NextBatchURL}} + content_type, body = self._encode_file_as_multipart( + field_name="uploaded_file", + filename=input_file, + content_type=source_mimetypes[format_][0], + contents=self._test_file_bytes(input_fullpath), + ) + + poster = self.site.sources.post + content = body + resp = poster( + content, + headers={"Content-Type": content_type, "Content-Length": str(len(body))}, + ) + ds = self.site.datasets.create(as_entity(ds_data)).refresh() + resp = ds.batches.post( + { + "element": "shoji:entity", + "body": { + "source": resp.headers["Location"], + "workflow": [], + }, + "savepoint": False, + } + ) # .json()["value"] + wait_progress(resp, self.site.session) + return ds.refresh() + + def _export_dataset(self, ds, format_: str = "csv") -> Dict[str, Any]: + """ + Runs a dataset export. + + :param ds: The dataset. + :param format_: The export format (one of `csv` and `spss`). + """ + output = tempfile.NamedTemporaryFile(mode="w+t", delete=False) + ds.export(output.name, format=format_) + return self._parse_dataset_export(output, format_) + + def _run_script(self, ds, payload: dict): + """ + Runs an automation script against a dataset. + """ + resp = ds.scripts.post(payload) + assert resp.status_code == 202 + wait_progress(resp, self.site.session) + return ds.refresh() + + def _parse_dataset_export(self, output: str, format_: str = "csv"): + """ + Given an output file, parses it and returns the values for it. + """ + if format_ == "csv": + reader = csv.DictReader(output) + + # put the data into columns + actual = {} + for row in reader: + for k, v in row.items(): + actual.setdefault(k, []).append(v) + return {k: [o.strip() for o in v] for k, v in actual.items()} + elif format_ == "spss": + data, metadata = pyspssio.read_sav(output.name) + return {k: list(data.get(k)) for k in metadata["var_names"]} + + def setUp(self): + self._created_datasets = {} + super().setUp() + + def tearDown(self): + for ds, views in self._created_datasets.values(): + for view in views.values(): + view.delete() + ds.delete() + + super().tearDown() + + def _project(self, id: str) -> Project: + """ + Returns the scrunch project instance for a specific project ID. 
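+        The entity is constructed locally from the project URL; no request is
+        issued at this point.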
+ """ + project = Project( + Entity( + self.site.session, + **{ + "self": f"{self.site.self}projects/{id}/", + "element": "shoji:entity", + "body": {"name": "Target project"}, + }, + ) + ) + return project + + def _log(self, msg: str): + print(msg) + + def _change_dataset_version(self, ds): + """ + Switches the current dataset project to the alternative option (i.e., from 3.6 to 3.11, or + the other way around). + """ + project_id = PROJECT_311_ID if self.CURRENT_VERSION == "3.6" else PROJECT_ID + ds.move(self._project(project_id)) + return ds + + def _revert_dataset_version(self, ds): + """ + Reverts the current dataset project to the original option. + """ + project_id = PROJECT_ID if self.CURRENT_VERSION == "3.6" else PROJECT_311_ID + ds.move(self._project(project_id)) + return ds + + def _create_view(self, ds, on_311=None, **values): + """ + Creates a test view. + """ + on_311 = self._parse_on_311(on_311) + ds_data = {k: v for k, v in values.items()} + name = values.pop("name", None) + ds_data["view_of"] = ds.self + if not name: + name = "Weighed test view dataset" if self.weight else "Test view dataset" + ds_data["name"] = ( + f"{name} {uuid.uuid4().hex[:16]} [{datetime.now().isoformat()}]" + ) + project_id = PROJECT_311_ID if on_311 else PROJECT_ID + if project_id: + ds_data["project"] = f"/projects/{project_id}/" + view = self.site.datasets.create(as_entity(ds_data)).refresh() + self._created_datasets[ds.self][1][view.self] = view + if self.weight: + view.settings.patch( + {"weight": view.variables.by("alias")[self.weight].entity.self} + ) + streaming_view = StreamingDataset(view) + self._log(f"[{streaming_view.id}] {name} [project={project_id}]") + return streaming_view, view + + def _create_dataset(self, on_311=None, pk=None, **values): + """ + Creates a test dataset. 
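+
+        The fixture has ten rows and covers the main variable types: numeric
+        (A, B, weight_var), text (DT), categorical (cat1, cat2) and a derived
+        numeric (der1), plus an optional numeric pk column.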
+ """ + on_311 = self._parse_on_311(on_311) + pk = values.pop("pk", None) + name = values.pop("name", None) + ds_data = {k: v for k, v in values.items()} + if not name: + name = "Weighed test dataset" if self.weight else "Test dataset" + ds_data["name"] = ( + f"{name} {uuid.uuid4().hex[:16]} [{datetime.now().isoformat()}]" + ) + project_id = PROJECT_311_ID if on_311 else PROJECT_ID + if project_id: + ds_data["project"] = f"/projects/{project_id}/" + ds = self.site.datasets.create(as_entity(ds_data)).refresh() + if pk: + ds.variables.create( + as_entity( + { + "name": "pk", + "alias": "pk", + "type": "numeric", + "values": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + } + ) + ) + ds.variables.create( + as_entity( + { + "name": "weight_var", + "alias": "weight_var", + "type": "numeric", + "values": [10, 8, 14, 10, 12, 9, 10, 11, 9, 7], + } + ) + ).refresh() + var1 = ds.variables.create( + as_entity( + { + "name": "A", + "alias": "A", + "type": "numeric", + "values": [1, 2, 3, None, 5, 6, 7, 8, None, 11], + } + ) + ).refresh() + ds.variables.create( + as_entity( + { + "name": "B", + "alias": "B", + "type": "numeric", + "values": [2, 3, 1, 5, None, 6, 8, 11, 7, None], + } + ) + ).refresh() + ds.variables.create( + as_entity( + { + "name": "DT", + "alias": "DT", + "type": "text", + "values": [ + "2024-10-02", + "2024-10-03", + "2024-10-01", + "2024-10-05", + None, + "2024-09-06", + "2024-10-08", + "2024-11-11", + "2024-10-07", + None, + ], + } + ) + ).refresh() + ds.variables.create( + as_entity( + { + "name": "cat1", + "alias": "cat1", + "type": "categorical", + "categories": [ + { + "id": 1, + "name": "cat 1", + "missing": False, + "numeric_value": None, + }, + { + "id": 2, + "name": "cat 2", + "missing": False, + "numeric_value": None, + }, + { + "id": 3, + "name": "cat 3", + "missing": False, + "numeric_value": None, + }, + { + "id": -1, + "name": "No Data", + "missing": True, + "numeric_value": None, + }, + ], + "values": [1, 2, 3, -1, -1, -1, 1, 2, 3, 1], + } + ) + ) + + ds.variables.create( + as_entity( + { + "name": "cat2", + "alias": "cat2", + "type": "categorical", + "categories": [ + { + "id": 1, + "name": "cat b1", + "missing": False, + "numeric_value": None, + }, + { + "id": 2, + "name": "cat b2", + "missing": False, + "numeric_value": None, + }, + { + "id": 3, + "name": "cat b3", + "missing": False, + "numeric_value": None, + }, + { + "id": -1, + "name": "No Data", + "missing": True, + "numeric_value": None, + }, + ], + "values": [1, 3, 2, -1, 1, -1, 1, 2, 3, -1], + } + ) + ) + ds.variables.create( + as_entity( + { + "name": "der1", + "alias": "der1", + "derived": True, + "derivation": { + "function": "+", + "args": [ + {"variable": var1.self}, + {"value": 1}, + ], + }, + } + ) + ) + self._created_datasets[ds.self] = (ds, {}) + if self.weight: + ds.settings.patch( + {"weight": ds.variables.by("alias")[self.weight].entity.self} + ) + ds.preferences.patch( + {"weight": ds.variables.by("alias")[self.weight].entity.self} + ) + streaming_ds = StreamingDataset(ds) + self._log(f"[{streaming_ds.id}] {name} [project={project_id}]") + return streaming_ds, ds + + def _get_var_values(self, var) -> Dict[str, Any]: + """ + Given a variable, runs a /dataset/DID/variable/VID/values/ call to get the data values + associated to it and parses them to return them. 
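+
+        Missing entries are returned as {"?": <missing code>} markers rather
+        than None.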
+ """ + return self.site.session.get(var.views["values"]).json()["value"] + + def __new__(cls, *args, **kwargs): + for fn_name in cls.TEST_FUNCTIONS: + if hasattr(cls, fn_name): + continue + orig_fn = getattr(cls, f"_{fn_name}", None) + if not orig_fn: + continue + setattr(cls, fn_name, orig_fn) + return super().__new__(cls) + + +class BaseTestDatasets(BaseTestCase): + """ + This class instantiates all the tests we need to run. The actual execution will be + taken care of by its subclasses, each of them having different settings (i.e., Python version + and/or weight variable settings). + """ + + def _test_create_dataset(self): + ds, _ = self._create_dataset(name="test_dataset") + assert set(ds.variable_aliases()) == { + "DT", + "cat1", + "cat2", + "der1", + "A", + "B", + "weight_var", + } + + def _test_switch_dataset(self): + ds, _ = self._create_dataset(name="test_dataset") + ds = self._change_dataset_version(ds) + assert set(ds.variable_aliases()) == { + "DT", + "cat1", + "cat2", + "der1", + "A", + "B", + "weight_var", + } + + def _test_add_variable_to_dataset(self): + ds, ds_instance = self._create_dataset(name="test_dataset") + ds = self._change_dataset_version(ds) + var1 = ds_instance.variables.by("alias")["A"].entity + ds_instance.variables.create( + as_entity( + { + "name": "der2", + "alias": "der2", + "derived": True, + "derivation": { + "function": "+", + "args": [ + {"variable": var1.self}, + {"value": 2}, + ], + }, + } + ) + ) + assert set(ds.variable_aliases()) == { + "DT", + "cat1", + "cat2", + "der1", + "der2", + "A", + "B", + "weight_var", + } + + def _test_delete_variable_from_dataset(self): + ds, ds_instance = self._create_dataset(name="test_dataset") + ds = self._change_dataset_version(ds) + der1 = ds_instance.variables.by("alias")["der1"].entity + der1.delete() + assert set(ds.variable_aliases()) == { + "DT", + "cat1", + "cat2", + "A", + "B", + "weight_var", + } + ds = self._revert_dataset_version(ds) + assert set(ds.variable_aliases()) == { + "DT", + "cat1", + "cat2", + "A", + "B", + "weight_var", + } + + def _test_dataset_with_view(self): + ds, ds_instance = self._create_dataset(name="test_dataset") + view, view_instance = self._create_view(ds_instance) + ds = self._change_dataset_version(ds) + view = self._change_dataset_version(view) + view2, view2_instance = self._create_view(ds_instance) + assert set(view.variable_aliases()) == { + "DT", + "cat1", + "cat2", + "der1", + "A", + "B", + "weight_var", + } + assert set(view2.variable_aliases()) == { + "DT", + "cat1", + "cat2", + "der1", + "A", + "B", + "weight_var", + } + view = self._revert_dataset_version(view) + view2 = self._revert_dataset_version(view2) + assert set(view.variable_aliases()) == { + "DT", + "cat1", + "cat2", + "der1", + "A", + "B", + "weight_var", + } + assert set(view2.variable_aliases()) == { + "DT", + "cat1", + "cat2", + "der1", + "A", + "B", + "weight_var", + } + + def _assert_cube_query(self, ds): + EXPECTED = numpy.array([27, 19, 23]) if self.weight else numpy.array([3, 2, 2]) + resp = crtabs(dataset=ds, variables=["cat1"], weight=self.weight) + numpy.testing.assert_array_equal(resp.counts, EXPECTED) + ds = self._change_dataset_version(ds) + resp = crtabs(dataset=ds, variables=["cat1"], weight=self.weight) + numpy.testing.assert_array_equal(resp.counts, EXPECTED) + + def _test_cube_query_on_dataset(self): + ds, ds_instance = self._create_dataset(name="test_dataset") + return self._assert_cube_query(ds) + + def _test_cube_query_on_view(self): + ds, ds_instance = 
self._create_dataset(name="test_dataset") + view, view_instance = self._create_view(ds_instance) + return self._assert_cube_query(view) + + def _test_run_script_change_var_name(self): + ds, ds_instance = self._create_dataset(name="test_dataset") + body = """CHANGE TITLE IN cat1 WITH "Var A";""" + ds_instance = self._run_script( + ds_instance, as_entity({"body": body, "async": False}) + ) + assert ds_instance.variables.by("alias")["cat1"].name == "Var A" + ds = self._change_dataset_version(ds) + ds_instance = ds_instance.refresh() + assert ds_instance.variables.by("alias")["cat1"].name == "Var A" + + def _test_run_script_replace_convert_to_numeric(self): + ds, ds_instance = self._create_dataset(name="test_dataset") + body = """ + REPLACE CONVERT cat1 TO NUMERIC; + """ + orig_var = ds_instance.variables.by("alias")["cat1"] + assert orig_var.get("type") == "categorical" + ds_instance = self._run_script( + ds_instance, as_entity({"body": body, "async": False}) + ) + new_var = ds_instance.variables.by("alias")["cat1"] + assert new_var.get("type") == "numeric" + ds = self._change_dataset_version(ds) + ds_instance = ds_instance.refresh() + new_var = ds_instance.variables.by("alias")["cat1"] + assert new_var.get("type") == "numeric" + + def _test_run_script_replace_convert_to_datetime(self): + ds, ds_instance = self._create_dataset(name="test_dataset") + body = """ + REPLACE CONVERT DT TO DATETIME FORMAT "%Y-%m-%d"; + """ + orig_var = ds_instance.variables.by("alias")["DT"] + assert orig_var.get("type") == "text" + assert self._get_var_values(orig_var.entity) == [ + "2024-10-02", + "2024-10-03", + "2024-10-01", + "2024-10-05", + {"?": -1}, + "2024-09-06", + "2024-10-08", + "2024-11-11", + "2024-10-07", + {"?": -1}, + ] + ds_instance = self._run_script( + ds_instance, as_entity({"body": body, "async": False}) + ) + new_var = ds_instance.variables.by("alias")["DT"] + assert new_var.get("type") == "datetime" + assert self._get_var_values(new_var.entity) == [ + "2024-10-02", + "2024-10-03", + "2024-10-01", + "2024-10-05", + {"?": -1}, + "2024-09-06", + "2024-10-08", + "2024-11-11", + "2024-10-07", + {"?": -1}, + ] + ds = self._change_dataset_version(ds) + ds_instance = ds_instance.refresh() + new_var = ds_instance.variables.by("alias")["DT"] + assert new_var.get("type") == "datetime" + assert self._get_var_values(new_var.entity) == [ + "2024-10-02", + "2024-10-03", + "2024-10-01", + "2024-10-05", + {"?": -1}, + "2024-09-06", + "2024-10-08", + "2024-11-11", + "2024-10-07", + {"?": -1}, + ] + + def _test_run_script_replace_convert_to_categorical(self): + ds, ds_instance = self._create_dataset(name="test_dataset") + body = """ + REPLACE CONVERT A, B TO CATEGORICAL WITH + VALUE 1 TO "T" CODE 1, + VALUE 0 TO "F" CODE 2; + """ + orig_var = ds_instance.variables.by("alias")["A"] + orig_var_2 = ds_instance.variables.by("alias")["B"] + assert orig_var.get("type") == "numeric" + assert orig_var_2.get("type") == "numeric" + assert orig_var.get("scale") is None + assert orig_var_2.get("scale") is None + ds_instance = self._run_script( + ds_instance, as_entity({"body": body, "async": False}) + ) + new_var = ds_instance.variables.by("alias")["A"] + new_var_2 = ds_instance.variables.by("alias")["B"] + assert new_var.get("type") == "categorical" + assert new_var_2.get("type") == "categorical" + assert new_var.get("scale") == "interval" + assert new_var_2.get("scale") == "interval" + ds = self._change_dataset_version(ds) + ds_instance = ds_instance.refresh() + new_var = ds_instance.variables.by("alias")["A"] + 
new_var_2 = ds_instance.variables.by("alias")["B"] + assert new_var.get("type") == "categorical" + assert new_var_2.get("type") == "categorical" + assert new_var.get("scale") == "interval" + assert new_var_2.get("scale") == "interval" + + def _test_run_script_replace_convert_to_text(self): + ds, ds_instance = self._create_dataset(name="test_dataset") + body = """ + REPLACE CONVERT A TO TEXT; + """ + orig_var = ds_instance.variables.by("alias")["A"] + assert orig_var.get("type") == "numeric" + ds_instance = self._run_script( + ds_instance, as_entity({"body": body, "async": False}) + ) + new_var = ds_instance.variables.by("alias")["A"] + assert new_var.get("type") == "text" + ds = self._change_dataset_version(ds) + ds_instance = ds_instance.refresh() + new_var = ds_instance.variables.by("alias")["A"] + assert new_var.get("type") == "text" + + def _test_run_script_create_categorical_array(self): + ds, ds_instance = self._create_dataset(name="test_dataset") + body = """ + CREATE CATEGORICAL ARRAY cat1, cat2 AS array1; + """ + assert "array1" not in ds_instance.variables.by("alias") + ds_instance = self._run_script( + ds_instance, as_entity({"body": body, "async": False}) + ) + new_var = ds_instance.variables.by("alias")["array1"] + assert new_var.get("type") == "categorical_array" + ds = self._change_dataset_version(ds) + ds_instance = ds_instance.refresh() + new_var = ds_instance.variables.by("alias")["array1"] + assert new_var.get("type") == "categorical_array" + + def _test_run_script_create_categorical_case(self): + ds, ds_instance = self._create_dataset(name="test_dataset") + body = """ + CREATE CATEGORICAL CASE WHEN + A == 1 THEN "Cat 1" + END + AS A1; + """ + ds_instance = self._run_script( + ds_instance, as_entity({"body": body, "async": False}) + ) + new_var = ds_instance.variables.by("alias")["A1"] + assert new_var.get("type") == "categorical" + resp = self._get_var_values(new_var.entity) + assert resp == [ + 1, + {"?": -1}, + {"?": -1}, + {"?": -1}, + {"?": -1}, + {"?": -1}, + {"?": -1}, + {"?": -1}, + {"?": -1}, + {"?": -1}, + ] + ds = self._change_dataset_version(ds) + ds_instance = ds_instance.refresh() + new_var = ds_instance.variables.by("alias")["A1"] + assert new_var.get("type") == "categorical" + resp = self._get_var_values(new_var.entity) + assert resp == [ + 1, + {"?": -1}, + {"?": -1}, + {"?": -1}, + {"?": -1}, + {"?": -1}, + {"?": -1}, + {"?": -1}, + {"?": -1}, + {"?": -1}, + ] + + def _test_run_script_create_categorical_recode(self): + ds, ds_instance = self._create_dataset(name="test_dataset") + body = """ + CREATE CATEGORICAL RECODE cat1 + MAPPING + "cat 1", "cat 2" INTO "first two" CODE 1 + AS myrecode; + """ + ds_instance = self._run_script( + ds_instance, as_entity({"body": body, "async": False}) + ) + new_var = ds_instance.variables.by("alias")["myrecode"] + assert new_var.get("type") == "categorical" + assert { + o["id"]: o["name"] for o in new_var.entity.summary.value["categories"] + } == {1: "first two", 3: "cat 3", -1: "No Data"} + resp = self._get_var_values(new_var.entity) + assert resp == [1, 1, 3, {"?": -1}, {"?": -1}, {"?": -1}, 1, 1, 3, 1] + ds = self._change_dataset_version(ds) + ds_instance = ds_instance.refresh() + new_var = ds_instance.variables.by("alias")["myrecode"] + assert new_var.get("type") == "categorical" + assert { + o["id"]: o["name"] for o in new_var.entity.summary.value["categories"] + } == {1: "first two", 3: "cat 3", -1: "No Data"} + resp = self._get_var_values(new_var.entity) + assert resp == [1, 1, 3, {"?": -1}, {"?": -1}, {"?": -1}, 
1, 1, 3, 1] + + def _test_run_script_create_numeric_array(self): + ds, ds_instance = self._create_dataset(name="test_dataset") + body = """ + CREATE NUMERIC ARRAY A, B AS NumArray; + """ + ds_instance = self._run_script( + ds_instance, as_entity({"body": body, "async": False}) + ) + new_var = ds_instance.variables.by("alias")["NumArray"] + assert new_var.get("type") == "numeric_array" + resp = self._get_var_values(new_var.entity) + assert resp == [ + [1.0, 2.0], + [2.0, 3.0], + [3.0, 1.0], + [{"?": -1}, 5.0], + [5.0, {"?": -1}], + [6.0, 6.0], + [7.0, 8.0], + [8.0, 11.0], + [{"?": -1}, 7.0], + [11.0, {"?": -1}], + ] + ds = self._change_dataset_version(ds) + ds_instance = ds_instance.refresh() + new_var = ds_instance.variables.by("alias")["NumArray"] + assert new_var.get("type") == "numeric_array" + resp = self._get_var_values(new_var.entity) + assert resp == [ + [1.0, 2.0], + [2.0, 3.0], + [3.0, 1.0], + [{"?": -1}, 5.0], + [5.0, {"?": -1}], + [6.0, 6.0], + [7.0, 8.0], + [8.0, 11.0], + [{"?": -1}, 7.0], + [11.0, {"?": -1}], + ] + + def _test_run_script_create_numeric(self): + ds, ds_instance = self._create_dataset(name="test_dataset") + body = """ + CREATE NUMERIC A + B AS sum; + """ + ds_instance = self._run_script( + ds_instance, as_entity({"body": body, "async": False}) + ) + new_var = ds_instance.variables.by("alias")["sum"] + assert new_var.get("type") == "numeric" + resp = self._get_var_values(new_var.entity) + assert resp == [ + 3.0, + 5.0, + 4.0, + {"?": -1}, + {"?": -1}, + 12.0, + 15.0, + 19.0, + {"?": -1}, + {"?": -1}, + ] + ds = self._change_dataset_version(ds) + ds_instance = ds_instance.refresh() + new_var = ds_instance.variables.by("alias")["sum"] + assert new_var.get("type") == "numeric" + resp = self._get_var_values(new_var.entity) + assert resp == [ + 3.0, + 5.0, + 4.0, + {"?": -1}, + {"?": -1}, + 12.0, + 15.0, + 19.0, + {"?": -1}, + {"?": -1}, + ] + + def _test_run_script_create_logical(self): + ds, ds_instance = self._create_dataset(name="test_dataset") + body = """ + CREATE LOGICAL A == 1 AS illogical; + """ + ds_instance = self._run_script( + ds_instance, as_entity({"body": body, "async": False}) + ) + new_var = ds_instance.variables.by("alias")["illogical"] + assert new_var.get("type") == "categorical" + ds = self._change_dataset_version(ds) + ds_instance = ds_instance.refresh() + new_var = ds_instance.variables.by("alias")["illogical"] + assert new_var.get("type") == "categorical" + + def _test_run_script_overwrite_numeric_values(self): + ds, ds_instance = self._create_dataset(name="test_dataset") + body = """ + OVERWRITE NUMERIC VALUES A WITH 0 WHEN cat1 = 1; + """ + new_var = ds_instance.variables.by("alias")["A"].entity + resp = self._get_var_values(new_var) + assert resp == [1.0, 2.0, 3.0, {"?": -1}, 5.0, 6.0, 7.0, 8.0, {"?": -1}, 11.0] + ds_instance = self._run_script( + ds_instance, as_entity({"body": body, "async": False}) + ) + resp = self._get_var_values(new_var) + assert resp == [0, 2.0, 3.0, {"?": -1}, 5.0, 6.0, 0, 8.0, {"?": -1}, 0] + ds = self._change_dataset_version(ds) + resp = self._get_var_values(new_var) + assert resp == [0, 2.0, 3.0, {"?": -1}, 5.0, 6.0, 0, 8.0, {"?": -1}, 0] + + def _test_run_script_set_exclusion(self): + ds, ds_instance = self._create_dataset(name="test_dataset") + body = """ + SET EXCLUSION weight_var == 0; + """ + assert ds.get_exclusion() is None + ds_instance = self._run_script( + ds_instance, as_entity({"body": body, "async": False}) + ) + assert ds.get_exclusion() == "weight_var == 0" + ds = self._change_dataset_version(ds) + 
ds_instance = ds_instance.refresh() + assert ds.get_exclusion() == "weight_var == 0" + + def _test_export_dataset(self, format_, expected): + ds, ds_instance = self._create_dataset(name="test_dataset") + exported = self._export_dataset(ds, format_) + numpy_assert_equal(exported, expected) + ds = self._change_dataset_version(ds) + exported = self._export_dataset(ds, format_) + numpy_assert_equal(exported, expected) + + def _test_export_dataset_as_csv(self): + EXPECTED = { + "A": [ + "1.0", + "2.0", + "3.0", + "No Data", + "5.0", + "6.0", + "7.0", + "8.0", + "No Data", + "11.0", + ], + "B": [ + "2.0", + "3.0", + "1.0", + "5.0", + "No Data", + "6.0", + "8.0", + "11.0", + "7.0", + "No Data", + ], + "weight_var": [ + "10.0", + "8.0", + "14.0", + "10.0", + "12.0", + "9.0", + "10.0", + "11.0", + "9.0", + "7.0", + ], + "cat1": ["1", "2", "3", "-1", "-1", "-1", "1", "2", "3", "1"], + "cat2": ["1", "3", "2", "-1", "1", "-1", "1", "2", "3", "-1"], + "DT": [ + "2024-10-02", + "2024-10-03", + "2024-10-01", + "2024-10-05", + "No Data", + "2024-09-06", + "2024-10-08", + "2024-11-11", + "2024-10-07", + "No Data", + ], + "der1": [ + "2.0", + "3.0", + "4.0", + "No Data", + "6.0", + "7.0", + "8.0", + "9.0", + "No Data", + "12.0", + ], + } + self._test_export_dataset("csv", EXPECTED) + + def _test_export_dataset_as_spss(self): + nan = float("nan") + EXPECTED = { + "A": [ + 1.0, + 2.0, + 3.0, + nan, + 5.0, + 6.0, + 7.0, + 8.0, + nan, + 11.0, + ], + "B": [ + 2.0, + 3.0, + 1.0, + 5.0, + nan, + 6.0, + 8.0, + 11.0, + 7.0, + nan, + ], + "weight_var": [ + 10.0, + 8.0, + 14.0, + 10.0, + 12.0, + 9.0, + 10.0, + 11.0, + 9.0, + 7.0, + ], + "cat1": [1.0, 2.0, 3.0, nan, nan, nan, 1.0, 2.0, 3.0, 1.0], + "cat2": [1.0, 3.0, 2.0, nan, 1.0, nan, 1.0, 2.0, 3.0, nan], + "der1": [ + 2.0, + 3.0, + 4.0, + nan, + 6.0, + 7.0, + 8.0, + 9.0, + nan, + 12.0, + ], + "DT": [ + "2024-10-02", + "2024-10-03", + "2024-10-01", + "2024-10-05", + "-1", + "2024-09-06", + "2024-10-08", + "2024-11-11", + "2024-10-07", + "-1", + ], + } + + self._test_export_dataset("spss", EXPECTED) + + def _test_import_spss_dataset(self): + # Still being implemented + pass + + def _test_import_csv_dataset(self): + imported_ds = self._import_dataset( + {"description": "Imported csv dataset"}, + "sample-1-expected.csv", + format_="csv", + ) + assert imported_ds.body.description == "Imported csv dataset" + assert set(imported_ds.variables.by("alias").keys()) == { + "Q7", + "Q99", + "Q6", + "Q2_5", + "Q2_4", + "Q4_9", + "Q2_9", + "Q4_3", + "Q2_1", + "Q4_1", + "Q3", + "Q4_5", + "Q5", + "Q4_2", + "Q2_3", + "Q2_2", + "Q4_4", + "Q1", + } + + +#: This is the list of all tests we want to support for integration purposes. +#: This list will grow by time as we implement other ones. 
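+#: Each name listed here must have a matching `_`-prefixed implementation on
+#: the class; `BaseTestCase.__new__` binds it as a real `test_*` method at
+#: instantiation time, so subclasses opt in simply by listing the tests they
+#: support.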
+ALL_TEST_FUNCTIONS = [ + "test_create_dataset", + "test_switch_dataset", + "test_add_variable_to_dataset", + "test_delete_variable_from_dataset", + "test_dataset_with_view", + "test_cube_query_on_dataset", + "test_cube_query_on_view", + "test_export_dataset_as_csv", + "test_export_dataset_as_spss", + "test_import_csv_dataset", + # TODO: still being implemented + # "test_import_spss_dataset", + "test_run_script_change_var_name", + "test_run_script_replace_convert_to_categorical", + "test_run_script_replace_convert_to_datetime", + "test_run_script_replace_convert_to_numeric", + "test_run_script_replace_convert_to_text", + "test_run_script_set_exclusion", + "test_run_script_create_categorical_array", + "test_run_script_create_logical", + "test_run_script_overwrite_numeric_values", + "test_run_script_create_numeric", + "test_run_script_create_numeric_array", + "test_run_script_create_categorical_case", + "test_run_script_create_categorical_recode", +] + + +class Test36Datasets(BaseTestDatasets): + """ + Dataset tests initially running on a Python 3.6 zz9 factory, no weight variable. + """ + + CURRENT_VERSION = "3.6" + TEST_FUNCTIONS = ALL_TEST_FUNCTIONS + + +class WeightedTest36Datasets(Test36Datasets): + """ + Dataset tests initially running on a Python 3.6 zz9 factory, with weight variable. + """ + + weight = "weight_var" + + +class Test311Datasets(BaseTestDatasets): + """ + Dataset tests initially running on a Python 3.11 zz9 factory, no weight variable. + """ + + CURRENT_VERSION = "3.11" + TEST_FUNCTIONS = ALL_TEST_FUNCTIONS + + +class WeightedTest311Datasets(Test311Datasets): + """ + Dataset tests initially running on a Python 3.11 zz9 factory, with weight variable. + """ + + weight = "weight_var" From 38cfd256571f044f307ebfc91a5bf41d921de8e5 Mon Sep 17 00:00:00 2001 From: Angelo Romano Date: Fri, 24 Jan 2025 16:50:43 +0100 Subject: [PATCH 2/5] [CADL-41] Fixing bug on scrunch --- scrunch/datasets.py | 6 +++--- scrunch/helpers.py | 10 ++++++++++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/scrunch/datasets.py b/scrunch/datasets.py index 8b7805d..a3d3f5e 100644 --- a/scrunch/datasets.py +++ b/scrunch/datasets.py @@ -34,7 +34,7 @@ subvar_alias, validate_categories, shoji_catalog_wrapper, get_else_case, else_case_not_selected, SELECTED_ID, NOT_SELECTED_ID, NO_DATA_ID, valid_categorical_date, - generate_subvariable_codes) + generate_subvariable_codes, shoji_order_wrapper) from scrunch.order import DatasetVariablesOrder, ProjectDatasetsOrder from scrunch.subentity import Deck, Filter, Multitable from scrunch.variables import (combinations_from_map, combine_categories_expr, @@ -2380,7 +2380,7 @@ def fork(self, description=None, name=None, is_published=False, :returns _fork: scrunch.datasets.BaseDataset """ from scrunch.mutable_dataset import MutableDataset - + # Handling project vs owner conflict owner = kwargs.get("owner") @@ -2421,7 +2421,7 @@ def fork(self, description=None, name=None, is_published=False, ) else: # Create fork in source dataset path. - body["project"] = self.resource.body.owner + body["project"] = self.resource.body.owner else: if project: # Create fork in given Project path. 
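
The helpers change below adds an order wrapper alongside the existing entity and
view wrappers. A minimal sketch of the payload it produces (the graph value is a
hypothetical list of variable URLs):

    >>> from scrunch.helpers import shoji_order_wrapper
    >>> shoji_order_wrapper(["../var_a/", "../var_b/"])
    {'element': 'shoji:order', 'graph': ['../var_a/', '../var_b/']}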
diff --git a/scrunch/helpers.py b/scrunch/helpers.py index e2e8041..32dc1c2 100644 --- a/scrunch/helpers.py +++ b/scrunch/helpers.py @@ -222,6 +222,16 @@ def shoji_view_wrapper(value, **kwargs): return payload +def shoji_order_wrapper(graph, **kwargs): + """ + receives a dictionary and wraps its content on a body keyed dictionary + with the appropriate shoji: attribute + """ + payload = {"element": "shoji:order", "graph": graph} + payload.update(**kwargs) + return payload + + def shoji_entity_wrapper(body, **kwargs): """ receives a dictionary and wraps its content on a body keyed dictionary From 728243a2b16d0a712d74e278e0eab591b429a9f0 Mon Sep 17 00:00:00 2001 From: Angelo Romano Date: Mon, 27 Jan 2025 17:01:05 +0100 Subject: [PATCH 3/5] [CADL-41] Removing typing due to incompatibility with the CI --- integration/__init__.py | 0 py311/test_dataset.py | 86 ++++++++++++++++++++++------------------- pytest.ini | 2 +- setup.py | 2 +- tox.ini | 12 ++++-- 5 files changed, 57 insertions(+), 45 deletions(-) create mode 100644 integration/__init__.py diff --git a/integration/__init__.py b/integration/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/py311/test_dataset.py b/py311/test_dataset.py index ed4e0c6..cff0111 100644 --- a/py311/test_dataset.py +++ b/py311/test_dataset.py @@ -2,13 +2,13 @@ import codecs import csv import os +import sys import tempfile import uuid from datetime import datetime -from typing import Any, Dict, Optional, Union import numpy -import pyspssio +import pytest from numpy.testing import assert_equal as numpy_assert_equal from pycrunch.shoji import Entity, as_entity, wait_progress @@ -17,6 +17,9 @@ from scrunch.datasets import Project from scrunch.streaming_dataset import StreamingDataset +IS_PYTHON_2 = bool(sys.version_info.major == 2) +if not IS_PYTHON_2: + import pyspssio PROJECT_ID = os.environ.get("SCRUNCH_PROJECT_ID") PROJECT_311_ID = os.environ.get("SCRUNCH_PROJECT_311_ID") TEST_DATASET_ID = os.environ.get("SCRUNCH_TEST_DATASET_ID") @@ -60,9 +63,7 @@ class ST: source_mimetypes["txt"] = ["text/csv"] # Backward compatibility -def ensure_binary( - s: Union[str, bytes], encoding: str = "utf-8", errors: str = "strict" -) -> bytes: +def ensure_binary(s, encoding="utf-8", errors="strict"): """Coerce **s** to bytes. 
- `str` -> encoded to `bytes` - `bytes` -> `bytes` @@ -75,7 +76,7 @@ def ensure_binary( return s if isinstance(s, str): return s.encode(encoding, errors) - raise TypeError(f"not expecting type '{type(s)}'") + raise TypeError("not expecting type '%s'" % type(s)) BOUNDARY = "________ThIs_Is_tHe_bouNdaRY_$" @@ -90,21 +91,23 @@ def encode_multipart_formdata(files): for key, filename, value in files: lines.append("--" + BOUNDARY) if filename is None: - lines.append(f'Content-Disposition: form-data; name="{key}"\r\n\r\n{value}') + lines.append( + 'Content-Disposition: form-data; name="%s"\r\n\r\n%s' % (key, value) + ) continue lines.append( - f'Content-Disposition: form-data; name="{key}"; filename="{filename}"' + 'Content-Disposition: form-data; name="%s"; filename="%s"' % (key, filename) ) ct = source_mimetypes.get( filename.rsplit(".", 1)[-1], ["application/octet-stream"] )[0] - lines.append(f"Content-Type: {ct}") + lines.append("Content-Type: %s" % ct) lines.append("") lines.append(value) lines.append("--" + BOUNDARY + "--") lines.append("") body = "\r\n".join(lines) - content_type = f"multipart/form-data; charset=UTF-8; boundary={BOUNDARY}" + content_type = "multipart/form-data; charset=UTF-8; boundary=%s" % BOUNDARY return content_type, body @@ -155,11 +158,11 @@ def _encode_file_as_multipart(self, field_name, filename, content_type, contents ) body += "--%s--\r\n" % BOUNDARY - content_type = f"multipart/form-data; charset=UTF-8; boundary={BOUNDARY}" + content_type = "multipart/form-data; charset=UTF-8; boundary=%s" % BOUNDARY return content_type, body - def _parse_on_311(self, on_311: Union[None, bool]) -> bool: + def _parse_on_311(self, on_311): """ Based on the value of the parameters, returns True or False, based on whether we are meant to run this on a Python 3.11 factory or not. @@ -174,10 +177,10 @@ def _parse_on_311(self, on_311: Union[None, bool]) -> bool: def _import_dataset( self, - metadata: Dict[str, Any], - input_file: str, - on_311: Optional[bool] = None, - format_: str = "csv", + metadata, + input_file, + on_311=None, + format_="csv", ): """ :param metadata: The metadata fields associated to the dataset we are creating. @@ -194,12 +197,14 @@ def _import_dataset( "Weighed imported test dataset" if self.weight else "Imported test dataset" ) ds_data = {k: v for k, v in metadata.items()} - ds_data["name"] = ( - f"{name} {uuid.uuid4().hex[:16]} [{datetime.now().isoformat()}]" + ds_data["name"] = "%s %s [%s]" % ( + name, + uuid.uuid4().hex[:16], + datetime.now().isoformat(), ) project_id = PROJECT_311_ID if on_311 else PROJECT_ID if project_id: - ds_data["project"] = f"/projects/{project_id}/" + ds_data["project"] = "/projects/%s" % project_id # server/tests/controllers/test_sources.py # streaming dataset # steps @@ -240,7 +245,7 @@ def _import_dataset( wait_progress(resp, self.site.session) return ds.refresh() - def _export_dataset(self, ds, format_: str = "csv") -> Dict[str, Any]: + def _export_dataset(self, ds, format_="csv"): """ Runs a dataset export. @@ -251,7 +256,7 @@ def _export_dataset(self, ds, format_: str = "csv") -> Dict[str, Any]: ds.export(output.name, format=format_) return self._parse_dataset_export(output, format_) - def _run_script(self, ds, payload: dict): + def _run_script(self, ds, payload): """ Runs an automation script against a dataset. 
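+        The script body uses Crunch Automation syntax; the POST returns a 202
+        progress response which is polled to completion before the refreshed
+        dataset is returned.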
""" @@ -260,7 +265,7 @@ def _run_script(self, ds, payload: dict): wait_progress(resp, self.site.session) return ds.refresh() - def _parse_dataset_export(self, output: str, format_: str = "csv"): + def _parse_dataset_export(self, output, format_="csv"): """ Given an output file, parses it and returns the values for it. """ @@ -289,23 +294,21 @@ def tearDown(self): super().tearDown() - def _project(self, id: str) -> Project: + def _project(self, id): """ Returns the scrunch project instance for a specific project ID. """ project = Project( Entity( self.site.session, - **{ - "self": f"{self.site.self}projects/{id}/", - "element": "shoji:entity", - "body": {"name": "Target project"}, - }, + self="%sprojects/%s" % (self.site.self, id), + element="shoji:entity", + body={"name": "Target project"}, ) ) return project - def _log(self, msg: str): + def _log(self, msg): print(msg) def _change_dataset_version(self, ds): @@ -335,12 +338,14 @@ def _create_view(self, ds, on_311=None, **values): ds_data["view_of"] = ds.self if not name: name = "Weighed test view dataset" if self.weight else "Test view dataset" - ds_data["name"] = ( - f"{name} {uuid.uuid4().hex[:16]} [{datetime.now().isoformat()}]" + ds_data["name"] = "%s %s [%s]" % ( + name, + uuid.uuid4().hex[:16], + datetime.now().isoformat(), ) project_id = PROJECT_311_ID if on_311 else PROJECT_ID if project_id: - ds_data["project"] = f"/projects/{project_id}/" + ds_data["project"] = "/projects/%s" % project_id view = self.site.datasets.create(as_entity(ds_data)).refresh() self._created_datasets[ds.self][1][view.self] = view if self.weight: @@ -348,7 +353,7 @@ def _create_view(self, ds, on_311=None, **values): {"weight": view.variables.by("alias")[self.weight].entity.self} ) streaming_view = StreamingDataset(view) - self._log(f"[{streaming_view.id}] {name} [project={project_id}]") + self._log("[%s] %s [project=%s]" % (streaming_view.id, name, project_id)) return streaming_view, view def _create_dataset(self, on_311=None, pk=None, **values): @@ -361,12 +366,14 @@ def _create_dataset(self, on_311=None, pk=None, **values): ds_data = {k: v for k, v in values.items()} if not name: name = "Weighed test dataset" if self.weight else "Test dataset" - ds_data["name"] = ( - f"{name} {uuid.uuid4().hex[:16]} [{datetime.now().isoformat()}]" + ds_data["name"] = "%s %s [%s]" % ( + name, + uuid.uuid4().hex[:16], + datetime.now().isoformat(), ) project_id = PROJECT_311_ID if on_311 else PROJECT_ID if project_id: - ds_data["project"] = f"/projects/{project_id}/" + ds_data["project"] = "/projects/%s" % project_id ds = self.site.datasets.create(as_entity(ds_data)).refresh() if pk: ds.variables.create( @@ -528,10 +535,10 @@ def _create_dataset(self, on_311=None, pk=None, **values): {"weight": ds.variables.by("alias")[self.weight].entity.self} ) streaming_ds = StreamingDataset(ds) - self._log(f"[{streaming_ds.id}] {name} [project={project_id}]") + self._log("[%s] %s [project=%s]" % (streaming_ds.id, name, project_id)) return streaming_ds, ds - def _get_var_values(self, var) -> Dict[str, Any]: + def _get_var_values(self, var): """ Given a variable, runs a /dataset/DID/variable/VID/values/ call to get the data values associated to it and parses them to return them. 
@@ -542,7 +549,7 @@ def __new__(cls, *args, **kwargs): for fn_name in cls.TEST_FUNCTIONS: if hasattr(cls, fn_name): continue - orig_fn = getattr(cls, f"_{fn_name}", None) + orig_fn = getattr(cls, "_" + fn_name, None) if not orig_fn: continue setattr(cls, fn_name, orig_fn) @@ -1117,6 +1124,7 @@ def _test_export_dataset_as_csv(self): } self._test_export_dataset("csv", EXPECTED) + @pytest.mark.skipif(IS_PYTHON_2, reason="Requires Python 3") def _test_export_dataset_as_spss(self): nan = float("nan") EXPECTED = { diff --git a/pytest.ini b/pytest.ini index 051ddd8..837f886 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,4 +1,4 @@ [pytest] norecursedirs=dist build .tox .eggs examples -addopts=--doctest-modules -p no:sugar +addopts=--doctest-modules -p no:sugar --ignore py311 doctest_optionflags=ALLOW_UNICODE ELLIPSIS diff --git a/setup.py b/setup.py index 64e4fdd..31ffe8d 100644 --- a/setup.py +++ b/setup.py @@ -64,7 +64,7 @@ 'pytest-cov==2.12.1', 'mock==3.0.5', 'isodate', - ], + ] + ([] if PY2 else ["pyspssio"]), 'docs': [ # upstream 'sphinx', diff --git a/tox.ini b/tox.ini index cf71b7e..d815b11 100644 --- a/tox.ini +++ b/tox.ini @@ -14,13 +14,15 @@ python = [testenv] deps = - setuptools>=31.0.1 + setuptools>=31.0.1 + contextlib2==0.6.0 + pytest pandas: pandas # workaround for yaml/pyyaml#126 # git+https://github.com/yaml/pyyaml@master#egg=pyyaml;python_version=="3.7" git+https://github.com/Crunch-io/pycrunch#pycrunch commands = - py.test {posargs} + pytest {posargs} python setup.py checkdocs usedevelop = True @@ -29,12 +31,14 @@ extras = [testenv:pandas] deps = - setuptools>=31.0.1 + setuptools>=31.0.1 + contextlib2==0.6.0 + pytest # workaround for yaml/pyyaml#126 # git+https://github.com/yaml/pyyaml@master#egg=pyyaml;python_version=="3.7" git+https://github.com/Crunch-io/pycrunch#pycrunch commands = - py.test {posargs} + pytest {posargs} python setup.py checkdocs usedevelop = True extras = From 807373a491ac8f51bd671cab0c17b2b14610f9ce Mon Sep 17 00:00:00 2001 From: Angelo Romano Date: Wed, 29 Jan 2025 15:12:04 +0100 Subject: [PATCH 4/5] [CADL-41] One more test and changes on project handling due to introduced incompatibility --- py311/test_dataset.py | 49 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 43 insertions(+), 6 deletions(-) diff --git a/py311/test_dataset.py b/py311/test_dataset.py index cff0111..46fed56 100644 --- a/py311/test_dataset.py +++ b/py311/test_dataset.py @@ -204,7 +204,7 @@ def _import_dataset( ) project_id = PROJECT_311_ID if on_311 else PROJECT_ID if project_id: - ds_data["project"] = "/projects/%s" % project_id + ds_data["project"] = "%sprojects/%s/" % (self.site.self, project_id) # server/tests/controllers/test_sources.py # streaming dataset # steps @@ -301,7 +301,7 @@ def _project(self, id): project = Project( Entity( self.site.session, - self="%sprojects/%s" % (self.site.self, id), + self="%sprojects/%s/" % (self.site.self, id), element="shoji:entity", body={"name": "Target project"}, ) @@ -345,7 +345,7 @@ def _create_view(self, ds, on_311=None, **values): ) project_id = PROJECT_311_ID if on_311 else PROJECT_ID if project_id: - ds_data["project"] = "/projects/%s" % project_id + ds_data["project"] = "%sprojects/%s/" % (self.site.self, project_id) view = self.site.datasets.create(as_entity(ds_data)).refresh() self._created_datasets[ds.self][1][view.self] = view if self.weight: @@ -373,7 +373,7 @@ def _create_dataset(self, on_311=None, pk=None, **values): ) project_id = PROJECT_311_ID if on_311 else PROJECT_ID if project_id: - 
ds_data["project"] = "/projects/%s" % project_id + ds_data["project"] = "%sprojects/%s/" % (self.site.self, project_id) ds = self.site.datasets.create(as_entity(ds_data)).refresh() if pk: ds.variables.create( @@ -704,6 +704,18 @@ def _test_cube_query_on_view(self): view, view_instance = self._create_view(ds_instance) return self._assert_cube_query(view) + def _test_run_script_rename_variable(self): + ds, ds_instance = self._create_dataset(name="test_dataset") + body = """RENAME A TO A1;""" + orig_var_id = ds_instance.variables.by("alias")["A"].id + ds_instance = self._run_script( + ds_instance, as_entity({"body": body, "async": False}) + ) + assert ds_instance.variables.by("alias")["A1"].id == orig_var_id + ds = self._change_dataset_version(ds) + ds_instance = ds_instance.refresh() + assert ds_instance.variables.by("alias")["A1"].id == orig_var_id + def _test_run_script_change_var_name(self): ds, ds_instance = self._create_dataset(name="test_dataset") body = """CHANGE TITLE IN cat1 WITH "Var A";""" @@ -1195,8 +1207,32 @@ def _test_export_dataset_as_spss(self): self._test_export_dataset("spss", EXPECTED) def _test_import_spss_dataset(self): - # Still being implemented - pass + imported_ds = self._import_dataset( + {"description": "Imported csv dataset"}, + "all_pets.sav", + format_="spss", + ) + assert imported_ds.body.description == "Imported spss dataset" + assert set(imported_ds.variables.by("alias").keys()) == { + "Q7", + "Q99", + "Q6", + "Q2_5", + "Q2_4", + "Q4_9", + "Q2_9", + "Q4_3", + "Q2_1", + "Q4_1", + "Q3", + "Q4_5", + "Q5", + "Q4_2", + "Q2_3", + "Q2_2", + "Q4_4", + "Q1", + } def _test_import_csv_dataset(self): imported_ds = self._import_dataset( @@ -1255,6 +1291,7 @@ def _test_import_csv_dataset(self): "test_run_script_create_numeric_array", "test_run_script_create_categorical_case", "test_run_script_create_categorical_recode", + "test_run_script_rename_variable", ] From fad943e7c7348f310d19b2bce9a2e72d4cdff559 Mon Sep 17 00:00:00 2001 From: Angelo Romano Date: Fri, 31 Jan 2025 17:38:09 +0100 Subject: [PATCH 5/5] [CADL-41] More testing --- py311/test_dataset.py | 595 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 568 insertions(+), 27 deletions(-) diff --git a/py311/test_dataset.py b/py311/test_dataset.py index 46fed56..6555220 100644 --- a/py311/test_dataset.py +++ b/py311/test_dataset.py @@ -8,6 +8,7 @@ from datetime import datetime import numpy +import pycrunch import pytest from numpy.testing import assert_equal as numpy_assert_equal from pycrunch.shoji import Entity, as_entity, wait_progress @@ -225,24 +226,25 @@ def _import_dataset( contents=self._test_file_bytes(input_fullpath), ) - poster = self.site.sources.post - content = body - resp = poster( - content, - headers={"Content-Type": content_type, "Content-Length": str(len(body))}, - ) ds = self.site.datasets.create(as_entity(ds_data)).refresh() + importer = pycrunch.importing.Importer() + fixture_filepath = os.path.join( + os.path.dirname(__file__), "fixture_files", input_file + ) + with open(fixture_filepath, "rb" if format_ == "spss" else "rt") as fp: + resp = importer.add_source(ds, input_file, fp, source_mimetypes[format_][0]) resp = ds.batches.post( { "element": "shoji:entity", "body": { - "source": resp.headers["Location"], + "source": resp, "workflow": [], }, "savepoint": False, } ) # .json()["value"] wait_progress(resp, self.site.session) + return ds.refresh() def _export_dataset(self, ds, format_="csv"): @@ -416,6 +418,161 @@ def _create_dataset(self, on_311=None, pk=None, **values): } ) ).refresh() 
+ ds.variables.create( + as_entity( + { + "name": "C1", + "alias": "C1", + "type": "text", + "values": ["13", "23", None, "5", None, "5", "10", "11", "4", "2"], + } + ) + ).refresh() + ds.variables.create( + as_entity( + { + "name": "C2", + "alias": "C2", + "type": "text", + "values": ["20", "13", "4", None, "5", None, "9", "12", None, "1"], + } + ) + ).refresh() + ds.variables.create( + as_entity( + { + "name": "MR1", + "alias": "MR1", + "type": "categorical", + "categories": [ + { + "id": 1, + "name": "CAT1", + "missing": False, + "numeric_value": 1, + }, + { + "id": 2, + "name": "CAT2", + "missing": False, + "numeric_value": 2, + }, + { + "id": 3, + "name": "CAT3", + "missing": False, + "numeric_value": 3, + }, + { + "id": -1, + "name": "No Data", + "missing": True, + "numeric_value": None, + }, + ], + "values": [-1, 1, 3, 2, 1, 2, -1, -1, 3, 3], + } + ) + ).refresh() + ds.variables.create( + as_entity( + { + "name": "MR2", + "alias": "MR2", + "type": "categorical", + "categories": [ + { + "id": 1, + "name": "CAT1", + "missing": False, + "numeric_value": 1, + }, + { + "id": 2, + "name": "CAT2", + "missing": False, + "numeric_value": 2, + }, + { + "id": 3, + "name": "CAT3", + "missing": False, + "numeric_value": 3, + }, + { + "id": 4, + "name": "CAT4", + "missing": False, + "numeric_value": 4, + }, + { + "id": -1, + "name": "No Data", + "missing": True, + "numeric_value": None, + }, + ], + "values": [1, -1, 3, 4, 2, 1, 3, 4, -1, 3], + } + ) + ).refresh() + ds.variables.create( + as_entity( + { + "name": "N1", + "alias": "N1", + "type": "numeric", + "values": [16, 26, 5, None, None, 5, 20, 11, 7, 2], + } + ) + ).refresh() + ds.variables.create( + as_entity( + { + "name": "N2", + "alias": "N2", + "type": "numeric", + "values": [10, 29, 4, None, 5, None, None, 22, 2, 9], + } + ) + ).refresh() + ds.variables.create( + as_entity( + { + "name": "M", + "type": "categorical_array", + "subvariables": [ + ds.variables.by("alias")["C1"].entity_url, + ds.variables.by("alias")["C2"].entity_url, + ], + } + ) + ).refresh() + ds.variables.create( + as_entity( + { + "name": "NA", + "type": "numeric_array", + "subvariables": [ + ds.variables.by("alias")["N1"].entity_url, + ds.variables.by("alias")["N2"].entity_url, + ], + } + ) + ).refresh() + ds.variables.create( + as_entity( + { + "name": "MR", + "type": "multiple_response", + "subvariables": [ + ds.variables.by("alias")["MR1"].entity_url, + ds.variables.by("alias")["MR2"].entity_url, + ], + "selected_categories": ["CAT1", "CAT2"], + } + ) + ).refresh() ds.variables.create( as_entity( { @@ -538,6 +695,31 @@ def _create_dataset(self, on_311=None, pk=None, **values): self._log("[%s] %s [project=%s]" % (streaming_ds.id, name, project_id)) return streaming_ds, ds + def assert_dataset_values( + self, ds, expected, fields=None, include=None, exclude=None + ): + if include or exclude: + _expected = expected.copy() + if include: + _expected.update(include) + if exclude: + for key in exclude: + del _expected[key] + else: + _expected = expected + assert _expected == self._get_dataset_values(ds, fields) + + def _get_dataset_values(self, ds, fields=None): + variables = { + k: o.entity + for k, o in ds.variables.by("alias").items() + if not fields or k in fields + } + resp = {} + for alias, var in variables.items(): + resp[alias] = self._get_var_values(var) + return resp + def _get_var_values(self, var): """ Given a variable, runs a /dataset/DID/variable/VID/values/ call to get the data values @@ -556,6 +738,64 @@ def __new__(cls, *args, **kwargs): return 
super().__new__(cls) +DATASET_EXPECTED = { + "weight_var": [10.0, 8.0, 14.0, 10.0, 12.0, 9.0, 10.0, 11.0, 9.0, 7.0], + "A": [1.0, 2.0, 3.0, {"?": -1}, 5.0, 6.0, 7.0, 8.0, {"?": -1}, 11.0], + "B": [2.0, 3.0, 1.0, 5.0, {"?": -1}, 6.0, 8.0, 11.0, 7.0, {"?": -1}], + "M": [ + [3, 10], + [5, 3], + [{"?": -1}, 6], + [7, {"?": -1}], + [{"?": -1}, 7], + [7, {"?": -1}], + [1, 11], + [2, 9], + [6, {"?": -1}], + [4, 8], + ], + "NA": [ + [16.0, 10.0], + [26.0, 29.0], + [5.0, 4.0], + [{"?": -1}, {"?": -1}], + [{"?": -1}, 5.0], + [5.0, {"?": -1}], + [20.0, {"?": -1}], + [11.0, 22.0], + [7.0, 2.0], + [2.0, 9.0], + ], + "MR": [ + [{"?": -1}, 1], + [1, {"?": -1}], + [3, 3], + [2, 4], + [1, 2], + [2, 1], + [{"?": -1}, 3], + [{"?": -1}, 4], + [3, {"?": -1}], + [3, 3], + ], + "DT": [ + "2024-10-02", + "2024-10-03", + "2024-10-01", + "2024-10-05", + {"?": -1}, + "2024-09-06", + "2024-10-08", + "2024-11-11", + "2024-10-07", + {"?": -1}, + ], + "cat1": [1, 2, 3, {"?": -1}, {"?": -1}, {"?": -1}, 1, 2, 3, 1], + "cat2": [1, 3, 2, {"?": -1}, 1, {"?": -1}, 1, 2, 3, {"?": -1}], + "der1": [2.0, 3.0, 4.0, {"?": -1}, 6.0, 7.0, 8.0, 9.0, {"?": -1}, 12.0], +} + + class BaseTestDatasets(BaseTestCase): """ This class instantiates all the tests we need to run. The actual execution will be @@ -572,6 +812,9 @@ def _test_create_dataset(self): "der1", "A", "B", + "M", + "NA", + "MR", "weight_var", } @@ -585,6 +828,9 @@ def _test_switch_dataset(self): "der1", "A", "B", + "M", + "NA", + "MR", "weight_var", } @@ -616,6 +862,9 @@ def _test_add_variable_to_dataset(self): "der2", "A", "B", + "M", + "NA", + "MR", "weight_var", } @@ -630,6 +879,9 @@ def _test_delete_variable_from_dataset(self): "cat2", "A", "B", + "M", + "NA", + "MR", "weight_var", } ds = self._revert_dataset_version(ds) @@ -639,6 +891,9 @@ def _test_delete_variable_from_dataset(self): "cat2", "A", "B", + "M", + "NA", + "MR", "weight_var", } @@ -655,6 +910,9 @@ def _test_dataset_with_view(self): "der1", "A", "B", + "M", + "NA", + "MR", "weight_var", } assert set(view2.variable_aliases()) == { @@ -664,6 +922,9 @@ def _test_dataset_with_view(self): "der1", "A", "B", + "M", + "NA", + "MR", "weight_var", } view = self._revert_dataset_version(view) @@ -675,6 +936,9 @@ def _test_dataset_with_view(self): "der1", "A", "B", + "M", + "NA", + "MR", "weight_var", } assert set(view2.variable_aliases()) == { @@ -684,6 +948,9 @@ def _test_dataset_with_view(self): "der1", "A", "B", + "M", + "NA", + "MR", "weight_var", } @@ -727,6 +994,101 @@ def _test_run_script_change_var_name(self): ds_instance = ds_instance.refresh() assert ds_instance.variables.by("alias")["cat1"].name == "Var A" + def _test_run_script_materialize_derived(self): + ds, ds_instance = self._create_dataset(name="test_dataset") + body = """ + MATERIALIZE VARIABLES der1; + """ + orig_var = ds_instance.variables.by("alias")["der1"] + assert orig_var.entity.body.derived is True + ds_instance = self._run_script( + ds_instance, as_entity({"body": body, "async": False}) + ) + new_var = ds_instance.variables.by("alias")["der1"] + assert new_var.entity.body.derived is False + self.assert_dataset_values(ds_instance, DATASET_EXPECTED) + # Version change + ds = self._change_dataset_version(ds) + ds_instance = ds_instance.refresh() + new_var = ds_instance.variables.by("alias")["der1"] + assert new_var.entity.body.derived is False + self.assert_dataset_values(ds_instance, DATASET_EXPECTED) + + def _test_run_script_create_dichotomy(self): + ds, ds_instance = self._create_dataset(name="test_dataset") + body = """ + CREATE MULTIPLE 
DICHOTOMY FROM M SELECTED "10" AS NewMR1;
+        """
+        orig_catarr_var = ds_instance.variables.by("alias")["M"]
+        assert orig_catarr_var.get("type") == "categorical_array"
+        ds_instance = self._run_script(
+            ds_instance, as_entity({"body": body, "async": False})
+        )
+        include_data = {
+            "NewMR1": [
+                [0, 0],
+                [0, 0],
+                [{"?": -1}, 0],
+                [0, {"?": -1}],
+                [{"?": -1}, 0],
+                [0, {"?": -1}],
+                [1, 0],
+                [0, 0],
+                [0, {"?": -1}],
+                [0, 0],
+            ],
+        }
+
+        new_catarr_var = ds_instance.variables.by("alias")["M"]
+        assert new_catarr_var.get("type") == "categorical_array"
+        new_mr_var = ds_instance.variables.by("alias")["NewMR1"]
+        assert new_mr_var.get("type") == "multiple_response"
+        self.assert_dataset_values(ds_instance, DATASET_EXPECTED, include=include_data)
+        # Version change
+        ds = self._change_dataset_version(ds)
+        ds_instance = ds_instance.refresh()
+        self.assert_dataset_values(ds_instance, DATASET_EXPECTED, include=include_data)
+
     def _test_run_script_replace_convert_to_numeric(self):
         ds, ds_instance = self._create_dataset(name="test_dataset")
         body = """
@@ -739,10 +1101,120 @@ def _test_run_script_replace_convert_to_numeric(self):
         )
         new_var = ds_instance.variables.by("alias")["cat1"]
         assert new_var.get("type") == "numeric"
+        include_data = {
+            "cat1": [
+                {"?": 1},
+                {"?": 1},
+                {"?": 1},
+                {"?": -1},
+                {"?": -1},
+                {"?": -1},
+                {"?": 1},
+                {"?": 1},
+                {"?": 1},
+                {"?": 1},
+            ]
+        }
+        self.assert_dataset_values(ds_instance, DATASET_EXPECTED, include=include_data)
         ds = self._change_dataset_version(ds)
         ds_instance = ds_instance.refresh()
         new_var = ds_instance.variables.by("alias")["cat1"]
         assert new_var.get("type") == "numeric"
+        self.assert_dataset_values(ds_instance, DATASET_EXPECTED, include=include_data)
+
+    def _test_run_script_replace_convert_to_numeric_array(self):
+        ds, ds_instance = self._create_dataset(name="test_dataset")
+        body = """
+        REPLACE CONVERT M TO NUMERIC;
+        """
+        orig_var = ds_instance.variables.by("alias")["M"]
+        assert orig_var.get("type") == "categorical_array"
+        subvars = orig_var.entity.subvariables.by("alias")
+        assert subvars["C1"].type == "categorical"
+        assert subvars["C2"].type == "categorical"
+        ds_instance = self._run_script(
+            ds_instance, as_entity({"body": body, "async": False})
+        )
+        new_var = ds_instance.variables.by("alias")["M"]
+        # Re-fetch the subvariables from the converted variable, not the stale one
+        subvars = new_var.entity.subvariables.by("alias")
+        assert new_var.get("type") == "numeric_array"
+        assert subvars["C1"].type == "numeric"
+        assert subvars["C2"].type == "numeric"
+        include_data = {
+            "M": [
+                [{"?": 1}, {"?": 1}],
+                [{"?": 1}, {"?": 1}],
+                [{"?": -1}, {"?": 1}],
+                [{"?": 1}, {"?": -1}],
+                [{"?": -1}, {"?": 1}],
+                [{"?": 1}, {"?": -1}],
+                [{"?": 1}, {"?": 1}],
+                [{"?": 1}, {"?": 1}],
+                [{"?": 1}, {"?": -1}],
+                [{"?": 1}, {"?": 1}],
+            ]
+        }
+        self.assert_dataset_values(ds_instance, DATASET_EXPECTED, include=include_data)
+        ds = self._change_dataset_version(ds)
+        ds_instance = ds_instance.refresh()
+        new_var = ds_instance.variables.by("alias")["M"]
+        assert new_var.get("type") == "numeric_array"
+        # Refresh subvariables after the version change before re-asserting
+        subvars = new_var.entity.subvariables.by("alias")
+        assert subvars["C1"].type == "numeric"
+        assert subvars["C2"].type == "numeric"
+        self.assert_dataset_values(ds_instance, DATASET_EXPECTED, include=include_data)
+
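+    # A minimal sketch of the month rollup the next test expects, assuming
+    # RESOLUTION "M" truncates datetimes to month precision (illustrative
+    # only, using the module-level datetime import):
+    #
+    #     datetime.strptime("2024-10-02", "%Y-%m-%d").strftime("%Y-%m")  # -> "2024-10"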
+    def _test_run_script_replace_convert_to_datetime_with_resolution(self):
+        ds, ds_instance = self._create_dataset(name="test_dataset")
+        body = """
+        REPLACE CONVERT DT TO DATETIME FORMAT "%Y-%m-%d" RESOLUTION "M";
+        """
+        orig_var = ds_instance.variables.by("alias")["DT"]
+        assert orig_var.get("type") == "text"
+        assert self._get_var_values(orig_var.entity) == [
+            "2024-10-02",
+            "2024-10-03",
+            "2024-10-01",
+            "2024-10-05",
+            {"?": -1},
+            "2024-09-06",
+            "2024-10-08",
+            "2024-11-11",
+            "2024-10-07",
+            {"?": -1},
+        ]
+        ds_instance = self._run_script(
+            ds_instance, as_entity({"body": body, "async": False})
+        )
+        new_var = ds_instance.variables.by("alias")["DT"]
+        assert new_var.get("type") == "datetime"
+        assert self._get_var_values(new_var.entity) == [
+            "2024-10",
+            "2024-10",
+            "2024-10",
+            "2024-10",
+            {"?": -1},
+            "2024-09",
+            "2024-10",
+            "2024-11",
+            "2024-10",
+            {"?": -1},
+        ]
+        ds = self._change_dataset_version(ds)
+        ds_instance = ds_instance.refresh()
+        new_var = ds_instance.variables.by("alias")["DT"]
+        assert new_var.get("type") == "datetime"
+        assert self._get_var_values(new_var.entity) == [
+            "2024-10",
+            "2024-10",
+            "2024-10",
+            "2024-10",
+            {"?": -1},
+            "2024-09",
+            "2024-10",
+            "2024-11",
+            "2024-10",
+            {"?": -1},
+        ]
 
     def _test_run_script_replace_convert_to_datetime(self):
         ds, ds_instance = self._create_dataset(name="test_dataset")
         body = """
@@ -819,6 +1291,11 @@ def _test_run_script_replace_convert_to_categorical(self):
         assert new_var_2.get("type") == "categorical"
         assert new_var.get("scale") == "interval"
         assert new_var_2.get("scale") == "interval"
+        include_data = {
+            "A": [1, 3, 4, {"?": -1}, 5, 6, 7, 8, {"?": -1}, 9],
+            "B": [3, 4, 1, 5, {"?": -1}, 6, 7, 8, 9, {"?": -1}],
+        }
+        self.assert_dataset_values(ds_instance, DATASET_EXPECTED, include=include_data)
         ds = self._change_dataset_version(ds)
         ds_instance = ds_instance.refresh()
         new_var = ds_instance.variables.by("alias")["A"]
@@ -827,6 +1304,7 @@ def _test_run_script_replace_convert_to_categorical(self):
         assert new_var_2.get("type") == "categorical"
         assert new_var.get("scale") == "interval"
         assert new_var_2.get("scale") == "interval"
+        self.assert_dataset_values(ds_instance, DATASET_EXPECTED, include=include_data)
 
     def _test_run_script_replace_convert_to_text(self):
         ds, ds_instance = self._create_dataset(name="test_dataset")
         body = """
@@ -840,10 +1318,26 @@ def _test_run_script_replace_convert_to_text(self):
         )
         new_var = ds_instance.variables.by("alias")["A"]
         assert new_var.get("type") == "text"
+        expected = {
+            "A": [
+                "1.0",
+                "2.0",
+                "3.0",
+                {"?": -1},
+                "5.0",
+                "6.0",
+                "7.0",
+                "8.0",
+                {"?": -1},
+                "11.0",
+            ]
+        }
+        self.assert_dataset_values(ds_instance, expected, fields=["A"])
         ds = self._change_dataset_version(ds)
         ds_instance = ds_instance.refresh()
         new_var = ds_instance.variables.by("alias")["A"]
         assert new_var.get("type") == "text"
+        self.assert_dataset_values(ds_instance, expected, fields=["A"])
 
     def _test_run_script_create_categorical_array(self):
         ds, ds_instance = self._create_dataset(name="test_dataset")
         body = """
@@ -856,10 +1350,26 @@ def _test_run_script_create_categorical_array(self):
         )
         new_var = ds_instance.variables.by("alias")["array1"]
         assert new_var.get("type") == "categorical_array"
+        include_data = {
+            "array1": [
+                [1, 4],
+                [2, 6],
+                [3, 5],
+                [{"?": -1}, {"?": -1}],
+                [{"?": -1}, 4],
+                [{"?": -1}, {"?": -1}],
+                [1, 4],
+                [2, 5],
+                [3, 6],
+                [1, {"?": -1}],
+            ]
+        }
+        self.assert_dataset_values(ds_instance, DATASET_EXPECTED, include=include_data)
         ds = self._change_dataset_version(ds)
         ds_instance = ds_instance.refresh()
         new_var = ds_instance.variables.by("alias")["array1"]
         assert new_var.get("type") == "categorical_array"
+        self.assert_dataset_values(ds_instance, DATASET_EXPECTED, include=include_data)
 
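+    # Note on the "array1" expectations above (inferred from the fixture data,
+    # not documented behaviour): binding cat1 and cat2 into a single array
+    # appears to merge their category sets, so cat2's original codes 1/2/3
+    # resurface as 4/5/6 -- e.g. row 1 pairs cat1=1 with cat2=1 (remapped to
+    # 4), giving [1, 4].
 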
     def _test_run_script_create_categorical_case(self):
         ds, ds_instance = self._create_dataset(name="test_dataset")
@@ -1025,10 +1535,13 @@ def _test_run_script_create_logical(self):
         )
         new_var = ds_instance.variables.by("alias")["illogical"]
         assert new_var.get("type") == "categorical"
+        include_data = {"illogical": [1, 0, 0, {"?": -1}, 0, 0, 0, 0, {"?": -1}, 0]}
+        self.assert_dataset_values(ds_instance, DATASET_EXPECTED, include=include_data)
         ds = self._change_dataset_version(ds)
         ds_instance = ds_instance.refresh()
         new_var = ds_instance.variables.by("alias")["illogical"]
         assert new_var.get("type") == "categorical"
+        self.assert_dataset_values(ds_instance, DATASET_EXPECTED, include=include_data)
 
     def _test_run_script_overwrite_numeric_values(self):
         ds, ds_instance = self._create_dataset(name="test_dataset")
         body = """
@@ -1133,6 +1646,34 @@ def _test_export_dataset_as_csv(self):
                 "No Data",
                 "12.0",
             ],
+            "C1": ["3", "5", "-1", "7", "-1", "7", "1", "2", "6", "4"],
+            "C2": ["10", "3", "6", "-1", "7", "-1", "11", "9", "-1", "8"],
+            "MR1": ["-1", "1", "3", "2", "1", "2", "-1", "-1", "3", "3"],
+            "MR2": ["1", "-1", "3", "4", "2", "1", "3", "4", "-1", "3"],
+            "N1": [
+                "16.0",
+                "26.0",
+                "5.0",
+                "No Data",
+                "No Data",
+                "5.0",
+                "20.0",
+                "11.0",
+                "7.0",
+                "2.0",
+            ],
+            "N2": [
+                "10.0",
+                "29.0",
+                "4.0",
+                "No Data",
+                "5.0",
+                "No Data",
+                "No Data",
+                "22.0",
+                "2.0",
+                "9.0",
+            ],
         }
 
         self._test_export_dataset("csv", EXPECTED)
@@ -1202,36 +1743,32 @@ def _test_export_dataset_as_spss(self):
                 "2024-10-07",
                 "-1",
             ],
+            "C1": [3.0, 5.0, nan, 7.0, nan, 7.0, 1.0, 2.0, 6.0, 4.0],
+            "C2": [10.0, 3.0, 6.0, nan, 7.0, nan, 11.0, 9.0, nan, 8.0],
+            "MR1": [nan, 1.0, 3.0, 2.0, 1.0, 2.0, nan, nan, 3.0, 3.0],
+            "MR2": [1.0, nan, 3.0, 4.0, 2.0, 1.0, 3.0, 4.0, nan, 3.0],
+            "N1": [16.0, 26.0, 5.0, nan, nan, 5.0, 20.0, 11.0, 7.0, 2.0],
+            "N2": [10.0, 29.0, 4.0, nan, 5.0, nan, nan, 22.0, 2.0, 9.0],
         }
 
         self._test_export_dataset("spss", EXPECTED)
 
     def _test_import_spss_dataset(self):
         imported_ds = self._import_dataset(
-            {"description": "Imported csv dataset"},
+            {"description": "Imported spss dataset"},
             "all_pets.sav",
             format_="spss",
         )
         assert imported_ds.body.description == "Imported spss dataset"
         assert set(imported_ds.variables.by("alias").keys()) == {
-            "Q7",
-            "Q99",
-            "Q6",
-            "Q2_5",
-            "Q2_4",
-            "Q4_9",
-            "Q2_9",
-            "Q4_3",
-            "Q2_1",
-            "Q4_1",
-            "Q3",
-            "Q4_5",
-            "Q5",
-            "Q4_2",
-            "Q2_3",
-            "Q2_2",
-            "Q4_4",
-            "Q1",
+            "favorite_pet_name",
+            "petloc_home",
+            "petloc_work",
+            "dogs_with_papers",
+            "dogs_without_papers",
+            "wave",
+            "caseid",
+            "allpets",
         }
 
     def _test_import_csv_dataset(self):
@@ -1277,10 +1814,12 @@ def _test_import_csv_dataset(self):
         "test_export_dataset_as_spss",
         "test_import_csv_dataset",
-        # TODO: still being implemented
-        # "test_import_spss_dataset",
+        "test_import_spss_dataset",
         "test_run_script_change_var_name",
+        "test_run_script_replace_convert_to_numeric_array",
         "test_run_script_replace_convert_to_categorical",
         "test_run_script_replace_convert_to_datetime",
+        "test_run_script_replace_convert_to_datetime_with_resolution",
         "test_run_script_replace_convert_to_numeric",
         "test_run_script_replace_convert_to_text",
         "test_run_script_set_exclusion",
@@ -1292,6 +1831,8 @@ def _test_import_csv_dataset(self):
         "test_run_script_create_categorical_case",
         "test_run_script_create_categorical_recode",
         "test_run_script_rename_variable",
+        "test_run_script_create_dichotomy",
+        "test_run_script_materialize_derived",
     ]
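 
+    # Sketch of how TEST_FUNCTIONS is expected to be consumed (an assumption;
+    # the dispatcher lives outside this diff): each public name resolves to
+    # the "_"-prefixed implementation on the class, e.g.
+    #
+    #     case = BaseTestDatasets()
+    #     for name in case.TEST_FUNCTIONS:
+    #         getattr(case, f"_{name}")()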