Skip to content

Commit

Permalink
Add new datasets (#434)
Browse files Browse the repository at this point in the history
* DRAFT: new datasets, analysis

* data: add palmerspenguins data

* restore analysis doc

* fix: siuba data getattr should use AttributeError

* feat(data): add band_members and related data

* docs: fix siuba data readme

* ci: update bigquery test versions
  • Loading branch information
machow authored Aug 31, 2022
1 parent 703d678 commit ae6dfe4
Show file tree
Hide file tree
Showing 12 changed files with 86 additions and 67 deletions.
15 changes: 1 addition & 14 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -62,19 +62,6 @@ jobs:
SB_TEST_PGPORT: 5433
PYTEST_FLAGS: ${{ matrix.pytest_flags }}

# optional step for running bigquery tests ----
- name: Set up Cloud SDK
if: ${{(contains(github.ref, 'bigquery') || contains(github.ref, 'refs/tags')) && matrix.latest}}
uses: google-github-actions/setup-gcloud@v0
with:
project_id: siuba-tests
service_account_key: ${{ secrets.GCP_SA_KEY }}
export_default_credentials: true
- name: Test bigquery
if: ${{(contains(github.ref, 'bigquery') || contains(github.ref, 'refs/tags')) && matrix.latest}}
run: |
pip install git+https://github.com/googleapis/python-bigquery-sqlalchemy.git pandas-gbq==0.15.0
test-bigquery:
name: "Test BigQuery"
runs-on: ubuntu-latest
Expand All @@ -91,7 +78,7 @@ jobs:
python -m pip install -r requirements.txt
python -m pip install -r requirements-test.txt
python -m pip install pytest-parallel
python -m pip install sqlalchemy-bigquery==1.3.0 pandas-gbq==0.15.0
python -m pip install sqlalchemy-bigquery==1.4 pandas-gbq==0.17
python -m pip install .
- name: Set up Cloud SDK
uses: google-github-actions/setup-gcloud@v0
Expand Down
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
include siuba/data/*.csv
include siuba/data/*.csv.gz
include siuba/spec/series.yml
14 changes: 14 additions & 0 deletions siuba/data/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# siuba datasets

| name | R package |
| ---- | --------- |
| mtcars | [datasets] |
| penguins | [palmerpenguins] |
| penguins_raw | [palmerpenguins] |
| band_members | [dplyr] |
| band_instruments | [dplyr] |
| band_instruments2 | [dplyr] |

[datasets]: https://stat.ethz.ch/R-manual/R-devel/library/datasets/html/00Index.html
[dplyr]: https://dplyr.tidyverse.org/
[palmerpenguins]: https://github.com/allisonhorst/palmerpenguins/
68 changes: 48 additions & 20 deletions siuba/data/__init__.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,57 @@
import pandas as pd
import pkg_resources
# Names of the datasets this module exposes. The module-level __getattr__
# rejects any name that is not listed here, and __dir__ returns this list.
__all__ = [
"mtcars",
"cars",
"penguins",
"penguins_raw",
"cars_sql",
"band_members",
"band_instruments",
"band_instruments2",
]

# mtcars ----------------------------------------------------------------------
_fname = pkg_resources.resource_filename("siuba.data", "mtcars.csv")
def __dir__():
    """Return the dataset names listed in ``__all__`` for ``dir()`` on this module."""
    return __all__

mtcars = pd.read_csv(_fname)
mtcars.__doc__ = """
mtcars data.

Source: Henderson and Velleman (1981), Building multiple regression models interactively. Biometrics, 37, 391–411.
def _load_data_csv_gz(name):
import pandas as pd
import pkg_resources

--- Original DataFrame docs below ---
""" + mtcars.__doc__
fname = pkg_resources.resource_filename("siuba.data", f"{name}.csv.gz")
return pd.read_csv(fname)


# cars ------------------------------------------------------------------------
cars = mtcars[["cyl", "mpg", "hp"]]
def _load_data_csv(name):
    """Read the ``<name>.csv`` resource of the ``siuba.data`` package.

    Parameters
    ----------
    name:
        Stem of the csv file (without the ``.csv`` suffix).

    Returns
    -------
    Whatever ``pandas.read_csv`` returns for that file.
    """
    # Imports are local to the function, so they only run when a dataset is loaded.
    from pandas import read_csv
    from pkg_resources import resource_filename

    csv_path = resource_filename("siuba.data", "{}.csv".format(name))
    return read_csv(csv_path)

# cars_sql --------------------------------------------------------------------
import siuba.sql.utils as _sql_utils
from siuba.sql import LazyTbl as _LazyTbl
cars_sql = _LazyTbl(
_sql_utils.mock_sqlalchemy_engine("postgresql"),
"cars",
["cyl", "mpg", "hp"]
)

def _load_data_cars_sql():
    """Build and return the ``cars_sql`` example table.

    The table is a ``LazyTbl`` named ``"cars"`` with the columns ``cyl``,
    ``mpg`` and ``hp``, created on the engine returned by
    ``siuba.sql.utils.mock_sqlalchemy_engine("postgresql")``.

    Returns
    -------
    LazyTbl
        The newly constructed table object.
    """
    # Imports are local to the function, so they only run when this loader is called.
    import siuba.sql.utils as _sql_utils
    from siuba.sql import LazyTbl as _LazyTbl

    cars_sql = _LazyTbl(
        _sql_utils.mock_sqlalchemy_engine("postgresql"),
        "cars",
        ["cyl", "mpg", "hp"],
    )

    # The table used to be built but never returned, so callers received None.
    return cars_sql


def __getattr__(name):
    """Load a dataset on attribute access to this module.

    Parameters
    ----------
    name:
        Attribute being looked up on the module.

    Returns
    -------
    The object produced by the loader that matches ``name``.

    Raises
    ------
    AttributeError
        If ``name`` is not one of the entries in ``__all__``.
    """
    if name not in __all__:
        raise AttributeError(f"No dataset named: {name}")

    # cars is a three-column selection from the mtcars data
    if name == "cars":
        mtcars = _load_data_csv_gz("mtcars")
        return mtcars[["cyl", "mpg", "hp"]]

    if name == "cars_sql":
        return _load_data_cars_sql()

    # these datasets are loaded from plain .csv files rather than .csv.gz
    plain_csv_names = {"band_members", "band_instruments", "band_instruments2"}
    if name in plain_csv_names:
        return _load_data_csv(name)

    # every remaining name in __all__ is loaded from <name>.csv.gz
    return _load_data_csv_gz(name)

# cars_sql --------------------------------------------------------------------
4 changes: 4 additions & 0 deletions siuba/data/band_instruments.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
name,plays
John,guitar
Paul,bass
Keith,guitar
4 changes: 4 additions & 0 deletions siuba/data/band_instruments2.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
artist,plays
John,guitar
Paul,bass
Keith,guitar
4 changes: 4 additions & 0 deletions siuba/data/band_members.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
name,band
Mick,Stones
John,Beatles
Paul,Beatles
33 changes: 0 additions & 33 deletions siuba/data/mtcars.csv

This file was deleted.

Binary file added siuba/data/mtcars.csv.gz
Binary file not shown.
Binary file added siuba/data/penguins.csv.gz
Binary file not shown.
Binary file added siuba/data/penguins_raw.csv.gz
Binary file not shown.
10 changes: 10 additions & 0 deletions siuba/tests/test_data_imports.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import importlib

def test_data_imports():
    """Fetch every name in ``siuba.data.__all__`` as a module attribute.

    The test passes as long as no lookup raises; the fetched values are not
    inspected.
    """
    import siuba.data
    from siuba.data import __all__ as dataset_names

    # `import *` is not allowed inside a function, so walk the names explicitly
    for dataset_name in dataset_names:
        getattr(siuba.data, dataset_name)

0 comments on commit ae6dfe4

Please sign in to comment.