forked from IBM/db2-samples
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Showing
8 changed files
with
13,580 additions
and
1 deletion.
There are no files selected for viewing
1,001 changes: 1,001 additions & 0 deletions
1,001
...k for Data Trained ML Pipeline/Cloud Pak for Data Assets/customer_full_summary_latest.csv
Large diffs are not rendered by default.
Oops, something went wrong.
312 changes: 312 additions & 0 deletions
312
... Pak for Data Trained ML Pipeline/Cloud Pak for Data Assets/customer_segmentation_prep.py
Large diffs are not rendered by default.
Oops, something went wrong.
Binary file added
BIN
+1.04 KB
... Cloud Pak for Data Trained ML Pipeline/Cloud Pak for Data Assets/original_csv_dtypes.pkl
Binary file not shown.
10,001 changes: 10,001 additions & 0 deletions
10,001
...with a Cloud Pak for Data Trained ML Pipeline/Cloud Pak for Data Assets/test_data_10K.csv
Large diffs are not rendered by default.
Oops, something went wrong.
285 changes: 285 additions & 0 deletions
285
...b2 Scoring with a Cloud Pak for Data Trained ML Pipeline/Notebooks/Dataset Creation.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,285 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": { | ||
"id": "97aed183-c9b5-488c-958e-0a38f1a0e88e" | ||
}, | ||
"source": [ | ||
"# Inserting Training and Test Data into a Db2 Table" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"id": "856ae9ae-ff30-4599-aa8a-2734cfca6d1b" | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"#Establishing Connection to Database\n", | ||
"\n", | ||
"import pandas as pd\n", | ||
"import ibm_db_dbi\n", | ||
"import ibm_db\n", | ||
"from project_lib import Project \n", | ||
"\n", | ||
"# Define connection string and connect to database\n", | ||
"project = Project.access()\n", | ||
"LocalDB2_credentials = project.get_connection(name = \"CSSDB3\")\n", | ||
"\n", | ||
"bluedb_connection = ibm_db.connect(\"DATABASE={};HOSTNAME={};PORT={};PROTOCOL=TCPIP;UID={};PWD={}\".format(LocalDB2_credentials['database'],\n", | ||
" LocalDB2_credentials['host'],\n", | ||
" LocalDB2_credentials['port'],\n", | ||
" LocalDB2_credentials['username'],\n", | ||
" LocalDB2_credentials['password']),\"\",\"\")\n", | ||
"dbi_bluedb_connection = ibm_db_dbi.Connection(bluedb_connection)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## Insert Training Data" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"id": "47cc7326-94cb-4b7e-98e0-efbf37c4973f" | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"#Reading in CSV file with raw data and preparing to write to DB2 \n", | ||
"\n", | ||
"train_data = pd.read_csv('/project_data/data_asset/customer_full_summary_latest.csv')\n", | ||
"\n", | ||
"train_data = test_data.where(pd.notnull(train_data),None)\n", | ||
"\n", | ||
"tuple_of_tuples = tuple([tuple(x) for x in train_data.values])" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"id": "19ae6527-d3f8-4622-b8e8-3acdcb617f3e" | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"#Creating string concatenations for sql query to write data \n", | ||
"\n", | ||
"type_mapping = {'int64': 'DOUBLE',\n", | ||
" 'float64': 'DOUBLE',\n", | ||
" 'object': 'VARCHAR(90)',\n", | ||
" 'datetime64[ns]': 'VARCHAR(90)'\n", | ||
" }\n", | ||
"\n", | ||
"value_string = ', '.join(['?' for i in range(len(train_data.columns))])\n", | ||
"dtype_mapping_4table = [type_mapping[str(x)] for x in list(train_data.dtypes)]\n", | ||
"table_creation_column_string = ', '.join('{0} {1} '.format(str(x),str(y)) for x,y in zip(train_data.columns,dtype_mapping_4table))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"id": "48fde0cb-e315-4727-9f8a-11656d7deef2" | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"sql = 'CREATE TABLE DSE.CUST_SEG_DATA_TRAIN({});'.format(table_creation_column_string)\n", | ||
"\n", | ||
"stmt = ibm_db.prepare(bluedb_connection, sql)\n", | ||
"\n", | ||
"ibm_db.execute(stmt)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"id": "e6a427d2-dfd3-47ab-ac33-339b5025bc53" | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"value_string = ', '.join(['?' for i in range(len(train_data.columns))])\n", | ||
"\n", | ||
"sql = \"INSERT INTO DSE.CUST_SEG_DATA_TRAIN VALUES({})\".format(value_string)\n", | ||
"\n", | ||
"stmt = ibm_db.prepare(bluedb_connection, sql)\n", | ||
"\n", | ||
"ibm_db.execute_many(stmt, tuple_of_tuples)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"id": "eb225a5b-30c8-4e30-abc0-ae2dd7e8e4fa" | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"# Validate successful data import\n", | ||
"input_df = pd.read_sql('select * from DSE.CUST_SEG_DATA_TRAIN',con = dbi_bluedb_connection)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"id": "3f87eb38-020b-4d04-9d7a-77040c9e6e71", | ||
"scrolled": true | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"input_df.shape" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"id": "cceef222-6910-4a37-8452-84acf385295b", | ||
"scrolled": true | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"input_df.head()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## Insert Test Data" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"id": "47cc7326-94cb-4b7e-98e0-efbf37c4973f" | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"#Reading in CSV file with raw data and preparing to write to DB2 \n", | ||
"\n", | ||
"test_data = pd.read_csv('/project_data/data_asset/test_data_10K.csv')\n", | ||
"\n", | ||
"test_data = test_data.where(pd.notnull(test_data),None)\n", | ||
"\n", | ||
"tuple_of_tuples = tuple([tuple(x) for x in test_data.values])" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"id": "19ae6527-d3f8-4622-b8e8-3acdcb617f3e" | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"#Creating string concatenations for sql query to write data \n", | ||
"\n", | ||
"type_mapping = {'int64': 'DOUBLE',\n", | ||
" 'float64': 'DOUBLE',\n", | ||
" 'object': 'VARCHAR(90)',\n", | ||
" 'datetime64[ns]': 'VARCHAR(90)'\n", | ||
" }\n", | ||
"\n", | ||
"value_string = ', '.join(['?' for i in range(len(test_data.columns))])\n", | ||
"dtype_mapping_4table = [type_mapping[str(x)] for x in list(test_data.dtypes)]\n", | ||
"table_creation_column_string = ', '.join('{0} {1} '.format(str(x),str(y)) for x,y in zip(test_data.columns,dtype_mapping_4table))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"id": "48fde0cb-e315-4727-9f8a-11656d7deef2" | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"sql = 'CREATE TABLE DSE.CUST_SEG_DATA_TEST({});'.format(table_creation_column_string)\n", | ||
"\n", | ||
"stmt = ibm_db.prepare(bluedb_connection, sql)\n", | ||
"\n", | ||
"ibm_db.execute(stmt)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"id": "e6a427d2-dfd3-47ab-ac33-339b5025bc53" | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"value_string = ', '.join(['?' for i in range(len(test_data.columns))])\n", | ||
"\n", | ||
"sql = \"INSERT INTO DSE.CUST_SEG_DATA_TEST VALUES({})\".format(value_string)\n", | ||
"\n", | ||
"stmt = ibm_db.prepare(bluedb_connection, sql)\n", | ||
"\n", | ||
"ibm_db.execute_many(stmt, tuple_of_tuples)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"id": "eb225a5b-30c8-4e30-abc0-ae2dd7e8e4fa" | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"# Validate successful data import\n", | ||
"input_df = pd.read_sql('select * from DSE.CUST_SEG_DATA_TEST',con = dbi_bluedb_connection)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"id": "3f87eb38-020b-4d04-9d7a-77040c9e6e71" | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"input_df.shape" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"id": "cceef222-6910-4a37-8452-84acf385295b" | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"input_df.head()" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.9.0" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 1 | ||
} |
Oops, something went wrong.