Skip to content

Commit

Permalink
Merge pull request #1 from yehoshuadimarsky/dev
Browse files Browse the repository at this point in the history
Automation, bug fixes
  • Loading branch information
yehoshuadimarsky authored Aug 7, 2019
2 parents bfe3883 + 3e1543d commit 5c35f22
Show file tree
Hide file tree
Showing 10 changed files with 261 additions and 48 deletions.
30 changes: 15 additions & 15 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,21 @@ Out[7]:
- python >= 3.6
- pandas

## Benchmarks
_# TODO_

## Installation
You can download and install this package from PyPI

```
pip install bcpandas
```

or from conda
```
conda install -c conda-forge bcpandas
```

## Motivations and Design
### Overview
Transferring data between pandas DataFrames and a SQL database is very slow using the built-in `read_sql` and `to_sql` methods, even with the newly introduced `execute_many` option. For Microsoft SQL Server, a far faster method is to use the BCP utility provided by Microsoft. This utility is a command-line tool that transfers data between the database and flat text files.
Expand Down Expand Up @@ -126,21 +141,6 @@ Currently, this is being built with only Windows in mind. Linux support is defin

Finally, the SQL Server databases supported are both the on-prem and Azure versions.

## Benchmarks
_# TODO_

## Installation
You can download and install this package from PyPI

```
pip install bcpandas
```

or from conda
```
conda install -c conda-forge bcpandas
```

## Contributing
Please, all contributions are very welcome!

Expand Down
6 changes: 4 additions & 2 deletions bcpandas/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,13 @@


# BCP check
cmds = [['bcp','-v'], ['sqlcmd','-?']]
cmds = [["bcp", "-v"], ["sqlcmd", "-?"]]
for cmd in cmds:
try:
subprocess.run(cmd)
except FileNotFoundError:
warnings.warn(f"{cmd[0].upper()} utility not installed or not found in PATH, bcpandas will not work!")
warnings.warn(
f"{cmd[0].upper()} utility not installed or not found in PATH, bcpandas will not work!"
)

del subprocess, warnings, cmd
56 changes: 42 additions & 14 deletions bcpandas/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,12 @@
IN,
NEWLINE,
OUT,
QUERYOUT,
QUOTECHAR,
SQL_TYPES,
TABLE,
VIEW,
QUERY,
)
from .utils import _get_sql_create_statement, bcp, build_format_file, get_temp_file, sqlcmd

Expand Down Expand Up @@ -58,6 +60,41 @@ def __init__(self, server, database, username=None, password=None):
self.with_krb_auth = True
logger.info(f"Created creds:\t{self}")

@classmethod
def from_quantity(cls, engine):
    """
    Alternate constructor, from a `sqlalchemy.engine.base.Engine` that uses `pyodbc` as the DBAPI
    (which is the SQLAlchemy default for MS SQL) and using an exact PyODBC connection string (not DSN or hostname).
    See https://docs.sqlalchemy.org/en/13/dialects/mssql.html#connecting-to-pyodbc for more.

    The `odbc_connect` query parameter of the engine URL is split on ';' into
    `key=value` pairs. Keywords are matched case-insensitively, and only the
    first '=' of each pair separates the keyword from its value.

    Parameters
    ----------
    engine : `sqlalchemy.engine.base.Engine`
        The SQLAlchemy engine object, configured as described above

    Returns
    -------
    `bcpandas.SqlCreds`

    Raises
    ------
    ValueError
        If the engine has no `odbc_connect` string, or that string lacks any of
        the Server, Database, UID or PWD keywords.
    """
    try:
        # get the odbc url part from the engine, split by ';' delimiter
        pairs = engine.url.query["odbc_connect"].split(";")

        conn_dict = {}
        for pair in pairs:
            # partition on the FIRST '=' only, so values that themselves contain
            # '=' (typically passwords) are kept whole
            keyword, sep, value = pair.partition("=")
            if sep:
                # ODBC keywords are case-insensitive; normalise for lookup
                conn_dict[keyword.strip().upper()] = value

        return cls(
            server=conn_dict["SERVER"].replace("tcp:", "").replace(",1433", ""),
            database=conn_dict["DATABASE"],
            username=conn_dict["UID"],
            password=conn_dict["PWD"],
        )
    except (KeyError, AttributeError) as err:
        raise ValueError(
            "The supplied 'engine' object could not be parsed correctly, try creating a SqlCreds object manually."
        ) from err

def __repr__(self):
# adopted from https://github.com/erdewit/ib_insync/blob/master/ib_insync/objects.py#L51
clsName = self.__class__.__qualname__
Expand Down Expand Up @@ -184,15 +221,7 @@ def to_sql(
)


def read_sql(
table_name,
creds,
sql_type="table",
schema="dbo",
mssql_odbc_driver_version=17,
batch_size=None,
debug=False
):
def read_sql(table_name, creds, sql_type="table", schema="dbo", batch_size=None, debug=False):
"""
Reads a SQL table, view, or query into a pandas DataFrame.
Expand All @@ -206,8 +235,6 @@ def read_sql(
The type of SQL object that the parameter `table_name` is.
schema : str, default 'dbo'
The SQL schema of the table or view. If a query, will be ignored.
mssql_odbc_driver_version : int, default 17
The installed version of the Microsoft ODBC Driver.
batch_size : int, optional
Rows will be read in batches of this size at a time. By default,
all rows will be read at once.
Expand All @@ -226,7 +253,6 @@ def read_sql(
"""
# check params
assert sql_type in SQL_TYPES
assert mssql_odbc_driver_version in {13, 17}, "SQL Server ODBC Driver must be either 13 or 17"

# set up objects
if ";" in table_name:
Expand All @@ -250,15 +276,17 @@ def read_sql(
try:
bcp(
sql_item=table_name,
direction=OUT,
direction=QUERYOUT if sql_type == QUERY else OUT,
flat_file=file_path,
creds=creds,
sql_type=sql_type,
schema=schema,
batch_size=batch_size,
)
logger.debug(f"Saved dataframe to temp CSV file at {file_path}")
return pd.read_csv(filepath_or_buffer=file_path, header=None, names=cols, index_col=False)
return pd.read_csv(
filepath_or_buffer=file_path, sep=DELIMITER, header=None, names=cols, index_col=False
)
finally:
if not debug:
logger.debug(f"Deleting temp CSV file")
Expand Down
2 changes: 1 addition & 1 deletion bcpandas/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def bcp(
elif direc in (OUT, QUERYOUT):
bcp_command += [
"-c", # marking as character data, not Unicode (maybe make as param?)
"-t,", # marking the delimiter as a comma (maybe also make as param?)
f"-t{DELIMITER}", # marking the delimiter as a comma (maybe also make as param?)
]

# execute
Expand Down
13 changes: 13 additions & 0 deletions dist.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"GH_user": "yehoshuadimarsky",
"name": "bcpandas",
"version": "0.1.3",
"GH_release_name": "v0.1.3 Release",
"GH_release_message": "Working to automate releases to GitHub, PyPI, and Conda-Forge.\n Fixed bug in `read_sql` that didn't set QUERYOUT when reading from a query.\n Added constructor to SqlCreds to create from SQLAlchemy.",
"short_description": "High-level wrapper around BCP for high performance data transfers between pandas and SQL Server. No knowledge of BCP required!!",
"author": "Josh Dimarsky",
"dependencies": [
"python >=3.6",
"pandas >=0.22"
]
}
51 changes: 51 additions & 0 deletions dist.ps1
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@

# Release automation script: publishes the version described in dist.json to
# GitHub and PyPI, then renders the conda recipe (meta.yaml) for that release.
# Expects dist.json, creds.json, dist.py and setup.py in the current directory.

# get configs and auth
Write-Host "Getting Configs..."
$config = Get-Content -Raw -Path "./dist.json" | ConvertFrom-Json
$auth = Get-Content -Raw -Path "./creds.json" | ConvertFrom-Json

$url = "https://github.com/$($config.GH_user)/$($config.name)/archive/$($config.version).tar.gz"
$dest = "$ENV:USERPROFILE\Downloads\$($config.name)-$(Split-Path $url -Leaf)"
$condaEnv = "dist-env"


# Setup conda
conda create -n $condaEnv python -y
conda install -n $condaEnv -c conda-forge pygithub twine click jinja2 -y
# get path to env python
$envpath = ((conda info -e) -match $condaEnv ).Split(" ")[-1]


# deploy to GitHub
Start-Process "$envpath\python.exe" -ArgumentList ".\dist.py github-release" -NoNewWindow -Wait


# PyPI
if (Test-Path "./dist") { Remove-Item "./dist" -Recurse; }
python .\setup.py sdist bdist_wheel

# add --repository-url https://test.pypi.org/legacy/ if to test.pypi.org
# Credentials are handed to twine through its TWINE_USERNAME / TWINE_PASSWORD
# environment variables instead of -u/-p, so the password is not exposed on the
# command line and cannot be mangled by spaces or quoting in the argument string.
$ENV:TWINE_USERNAME = $auth.pypi_username
$ENV:TWINE_PASSWORD = $auth.pypi_password
try {
    Start-Process "$envpath\python.exe" -ArgumentList "-m twine upload --verbose dist/*" -NoNewWindow -Wait
}
finally {
    # do not leave the credentials behind in the session environment
    Remove-Item Env:TWINE_USERNAME, Env:TWINE_PASSWORD -ErrorAction SilentlyContinue
}


# conda
# get sha256 of GitHub tar.gz
Write-Host "Downloading $($config.name) from $url"
(New-Object System.Net.WebClient).DownloadFile($url, $dest)
if (!(Test-Path $dest)) {
    # without the archive there is nothing to hash, so skip rendering the recipe
    # rather than writing a meta.yaml with an empty checksum
    Write-Host "Error, $($config.name) not found in $dest" -ForegroundColor Red
}
else {
    Write-Host "$($config.name) downloaded successfully to $dest" -ForegroundColor Green

    # Get-FileHash returns the digest directly (upper-case hex); lower-case it to
    # match the conventional sha256 spelling used in conda recipes
    $hash = (Get-FileHash -Path $dest -Algorithm SHA256).Hash.ToLower()

    # render meta.yaml
    Start-Process "$envpath\python.exe" -ArgumentList ".\dist.py render-conda --sha256 $hash" -NoNewWindow -Wait
}


# TODO upload meta.yaml to conda-forge feedstock, create PR

# cleanup
if (Test-Path $dest) { Remove-Item -Path $dest }
conda remove -n $condaEnv --all -y
# conda may already have deleted the environment directory
if (Test-Path $envpath) { Remove-Item -Path $envpath -Recurse }
72 changes: 72 additions & 0 deletions dist.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 6 22:05:04 2019
@author: JoshDimarsky
"""

import json

import click
from jinja2 import Environment, FileSystemLoader
from github import Github


@click.group()
def cli():
    # Root command group: does no work itself, it only hosts the sub-commands
    # registered on it with @cli.command().
    return None


@cli.command()
def github_release():
    """Create a GitHub release for the version described in dist.json.

    Reads the GitHub token from ./creds.json and the release metadata
    (user, repo name, version, release name and message) from ./dist.json,
    then publishes a non-draft, non-prerelease release tagged with the
    version and targeting the `master` branch.
    """
    print("getting auth and config")
    # JSON is UTF-8 by specification; be explicit so the platform's locale
    # encoding (e.g. cp1252 on Windows) is never used to decode it.
    with open("./creds.json", encoding="utf-8") as creds_file:
        auth = json.load(creds_file)

    with open("./dist.json", encoding="utf-8") as config_file:
        config = json.load(config_file)
    print("auth and config loaded")

    print("logging into GitHub")
    g = Github(auth["github_token"])
    repo = g.get_repo(f"{config['GH_user']}/{config['name']}")
    master_branch = repo.get_branch("master")

    print("Creating release")
    # https://pygithub.readthedocs.io/en/latest/github_objects/Repository.html#github.Repository.Repository.create_git_release
    repo.create_git_release(
        tag=config["version"],
        name=config["GH_release_name"],
        message=config["GH_release_message"],
        draft=False,
        prerelease=False,
        target_commitish=master_branch,
    )
    print("All done!")


@cli.command()
@click.option("--sha256", required=True, type=str)
def render_conda(sha256):
    """Render meta.template.yaml into meta.yaml for the current release.

    Parameters
    ----------
    sha256 : str
        Checksum inserted into the rendered recipe as `sha256val`.

    The remaining template values (name, version, creator, dependencies) are
    read from ./dist.json. The result is written to ./meta.yaml.
    """
    print("jinja rendering conda template file called meta.template.yaml ...")
    # JSON is UTF-8 by specification; do not depend on the locale encoding.
    with open("./dist.json", "r", encoding="utf-8") as config_file:
        config = json.load(config_file)

    env = Environment(loader=FileSystemLoader("."))
    template = env.get_template("meta.template.yaml")

    rendered = template.render(
        name=config["name"],
        version=config["version"],
        sha256val=sha256,
        creator=config["GH_user"],
        dependencies=config["dependencies"],
        # re-emit the literal placeholder so "{{ PYTHON }}" survives rendering
        # and appears verbatim in meta.yaml
        PYTHON="{{ PYTHON }}",
    )
    # write the recipe as UTF-8 regardless of the platform default
    with open("./meta.yaml", "wt", encoding="utf-8") as yaml_file:
        yaml_file.write(rendered)
    print("all done - file rendered and saved as meta.yaml")


if __name__ == "__main__":
    # run the click command group only when executed as a script
    cli()
42 changes: 42 additions & 0 deletions meta.template.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
package:
name: {{ name|lower }}
version: "{{ version }}"

source:
url: https://github.com/{{ creator }}/{{ name }}/archive/{{ version }}.tar.gz
sha256: {{ sha256val }}

build:
noarch: python
number: 0
script: "{{ PYTHON }} -m pip install . -vv"

requirements:
host:
- python >=3.6
- pip
- setuptools

run: {% for dep in dependencies %}
- {{ dep }} {% endfor %}

test:
imports:
- bcpandas

about:
home: https://pypi.org/project/{{ name }}
license: MIT
license_family: MIT
license_file: LICENSE
summary: Wrapper around BCP to transfer data between pandas and SQL Server.
description: |
High-level wrapper around BCP for high performance data transfers between pandas and SQL Server.
No knowledge of BCP required!!
doc_url: https://github.com/{{ creator }}/{{ name }}
dev_url: https://github.com/{{ creator }}/{{ name }}

extra:
recipe-maintainers:
- {{ creator }}

22 changes: 14 additions & 8 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,24 @@
import json
from setuptools import setup, find_packages

with open("README.md", "r") as fh:
long_description = fh.read()
with open("./dist.json", "r") as file:
config = json.load(file)

with open("./README.md", "r") as file:
long_description = file.read()


setup(
name="bcpandas",
version="0.1.2",
author="Josh Dimarsky",
description="High-level wrapper around BCP for high performance data transfers between pandas and SQL Server. No knowledge of BCP required!!",
name=config["name"],
version=config["version"],
author=config["author"],
description=config["short_description"],
long_description=long_description,
long_description_content_type="text/markdown",
url="https://github.com/yehoshuadimarsky/bcpandas",
url=f"https://github.com/{config['GH_user']}/{config['name']}",
packages=find_packages(exclude=["tests.*", "tests"]),
python_requires=">=3.6",
# python_requires must be a single version-specifier string (e.g. ">=3.6"), not a
# list of conda-style "python >=3.6" entries, so strip the leading package name
python_requires=",".join(
    x.split(" ", 1)[1] for x in config["dependencies"] if x.startswith("python ")
),
install_requires=[x for x in config["dependencies"] if not x.startswith("python ")],
keywords="bcp mssql pandas",
classifiers=[
"Topic :: Database",
Expand Down
Loading

0 comments on commit 5c35f22

Please sign in to comment.