diff --git a/.gitignore b/.gitignore index 7b2cec702..c3a3a26d0 100644 --- a/.gitignore +++ b/.gitignore @@ -15,6 +15,8 @@ data-profiler-py3/ DataProfiler/ +feature_branch/ + build/ dist/ @@ -37,4 +39,4 @@ venv/ *.orig dataprofiler/labelers/embeddings/glove-reduced-64D.txt -.torchignore \ No newline at end of file +.torchignore diff --git a/README.md b/README.md index 74697683c..63e002386 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,7 @@ date, then clone the dataprofiler from the feature branch you want in order to update the documentation (probably main). # clone the feature branch into the base of this repository - git clone https://github.com/capitalone/DataProfiler DataProfiler + git clone https://github.com/capitalone/DataProfiler feature_branch Install requirements: @@ -47,9 +47,9 @@ Install requirements: pip install requirements.txt # install the requirements from the feature branch - pip install DataProfiler/requirements.txt - pip install DataProfiler/requirements-ml.txt - pip install DataProfiler/requirements-reports.txt + pip install feature_branch/requirements.txt + pip install feature_branch/requirements-ml.txt + pip install feature_branch/requirements-reports.txt In /docs run: diff --git a/docs/0.7.1/doctrees/API.doctree b/docs/0.7.1/doctrees/API.doctree new file mode 100644 index 000000000..e54b87384 Binary files /dev/null and b/docs/0.7.1/doctrees/API.doctree differ diff --git a/docs/0.7.1/doctrees/add_new_model_to_data_labeler.doctree b/docs/0.7.1/doctrees/add_new_model_to_data_labeler.doctree new file mode 100644 index 000000000..b1e9f672d Binary files /dev/null and b/docs/0.7.1/doctrees/add_new_model_to_data_labeler.doctree differ diff --git a/docs/0.7.1/doctrees/data_labeling.doctree b/docs/0.7.1/doctrees/data_labeling.doctree new file mode 100644 index 000000000..2372ae10b Binary files /dev/null and b/docs/0.7.1/doctrees/data_labeling.doctree differ diff --git a/docs/0.7.1/doctrees/data_reader.doctree b/docs/0.7.1/doctrees/data_reader.doctree new file mode 100644 index 000000000..82346e4de Binary files /dev/null and b/docs/0.7.1/doctrees/data_reader.doctree differ diff --git a/docs/0.7.1/doctrees/data_readers.doctree b/docs/0.7.1/doctrees/data_readers.doctree new file mode 100644 index 000000000..92e3d3323 Binary files /dev/null and b/docs/0.7.1/doctrees/data_readers.doctree differ diff --git a/docs/0.7.1/doctrees/dataprofiler.data_readers.avro_data.doctree b/docs/0.7.1/doctrees/dataprofiler.data_readers.avro_data.doctree new file mode 100644 index 000000000..cb0baa08a Binary files /dev/null and b/docs/0.7.1/doctrees/dataprofiler.data_readers.avro_data.doctree differ diff --git a/docs/0.7.1/doctrees/dataprofiler.data_readers.base_data.doctree b/docs/0.7.1/doctrees/dataprofiler.data_readers.base_data.doctree new file mode 100644 index 000000000..810b0d673 Binary files /dev/null and b/docs/0.7.1/doctrees/dataprofiler.data_readers.base_data.doctree differ diff --git a/docs/0.7.1/doctrees/dataprofiler.data_readers.csv_data.doctree b/docs/0.7.1/doctrees/dataprofiler.data_readers.csv_data.doctree new file mode 100644 index 000000000..5c22d4d17 Binary files /dev/null and b/docs/0.7.1/doctrees/dataprofiler.data_readers.csv_data.doctree differ diff --git a/docs/0.7.1/doctrees/dataprofiler.data_readers.data.doctree b/docs/0.7.1/doctrees/dataprofiler.data_readers.data.doctree new file mode 100644 index 000000000..bfdc360ad Binary files /dev/null and b/docs/0.7.1/doctrees/dataprofiler.data_readers.data.doctree differ diff --git 
a/docs/0.7.1/doctrees/dataprofiler.data_readers.data_utils.doctree b/docs/0.7.1/doctrees/dataprofiler.data_readers.data_utils.doctree new file mode 100644 index 000000000..5c76eb001 Binary files /dev/null and b/docs/0.7.1/doctrees/dataprofiler.data_readers.data_utils.doctree differ diff --git a/docs/0.7.1/doctrees/dataprofiler.data_readers.doctree b/docs/0.7.1/doctrees/dataprofiler.data_readers.doctree new file mode 100644 index 000000000..4d64894e8 Binary files /dev/null and b/docs/0.7.1/doctrees/dataprofiler.data_readers.doctree differ diff --git a/docs/0.7.1/doctrees/dataprofiler.data_readers.filepath_or_buffer.doctree b/docs/0.7.1/doctrees/dataprofiler.data_readers.filepath_or_buffer.doctree new file mode 100644 index 000000000..e412e8162 Binary files /dev/null and b/docs/0.7.1/doctrees/dataprofiler.data_readers.filepath_or_buffer.doctree differ diff --git a/docs/0.7.1/doctrees/dataprofiler.data_readers.json_data.doctree b/docs/0.7.1/doctrees/dataprofiler.data_readers.json_data.doctree new file mode 100644 index 000000000..38f117153 Binary files /dev/null and b/docs/0.7.1/doctrees/dataprofiler.data_readers.json_data.doctree differ diff --git a/docs/0.7.1/doctrees/dataprofiler.data_readers.parquet_data.doctree b/docs/0.7.1/doctrees/dataprofiler.data_readers.parquet_data.doctree new file mode 100644 index 000000000..ecab1357c Binary files /dev/null and b/docs/0.7.1/doctrees/dataprofiler.data_readers.parquet_data.doctree differ diff --git a/docs/0.7.1/doctrees/dataprofiler.data_readers.structured_mixins.doctree b/docs/0.7.1/doctrees/dataprofiler.data_readers.structured_mixins.doctree new file mode 100644 index 000000000..4dfed87f7 Binary files /dev/null and b/docs/0.7.1/doctrees/dataprofiler.data_readers.structured_mixins.doctree differ diff --git a/docs/0.7.1/doctrees/dataprofiler.data_readers.text_data.doctree b/docs/0.7.1/doctrees/dataprofiler.data_readers.text_data.doctree new file mode 100644 index 000000000..8bb7c100b Binary files /dev/null and b/docs/0.7.1/doctrees/dataprofiler.data_readers.text_data.doctree differ diff --git a/docs/0.7.1/doctrees/dataprofiler.doctree b/docs/0.7.1/doctrees/dataprofiler.doctree new file mode 100644 index 000000000..55046050b Binary files /dev/null and b/docs/0.7.1/doctrees/dataprofiler.doctree differ diff --git a/docs/0.7.1/doctrees/dataprofiler.dp_logging.doctree b/docs/0.7.1/doctrees/dataprofiler.dp_logging.doctree new file mode 100644 index 000000000..9da8ca8f3 Binary files /dev/null and b/docs/0.7.1/doctrees/dataprofiler.dp_logging.doctree differ diff --git a/docs/0.7.1/doctrees/dataprofiler.labelers.base_data_labeler.doctree b/docs/0.7.1/doctrees/dataprofiler.labelers.base_data_labeler.doctree new file mode 100644 index 000000000..d015e4090 Binary files /dev/null and b/docs/0.7.1/doctrees/dataprofiler.labelers.base_data_labeler.doctree differ diff --git a/docs/0.7.1/doctrees/dataprofiler.labelers.base_model.doctree b/docs/0.7.1/doctrees/dataprofiler.labelers.base_model.doctree new file mode 100644 index 000000000..2e6e984f2 Binary files /dev/null and b/docs/0.7.1/doctrees/dataprofiler.labelers.base_model.doctree differ diff --git a/docs/0.7.1/doctrees/dataprofiler.labelers.character_level_cnn_model.doctree b/docs/0.7.1/doctrees/dataprofiler.labelers.character_level_cnn_model.doctree new file mode 100644 index 000000000..4c991aa67 Binary files /dev/null and b/docs/0.7.1/doctrees/dataprofiler.labelers.character_level_cnn_model.doctree differ diff --git a/docs/0.7.1/doctrees/dataprofiler.labelers.classification_report_utils.doctree 
b/docs/0.7.1/doctrees/dataprofiler.labelers.classification_report_utils.doctree new file mode 100644 index 000000000..83a3e2adf Binary files /dev/null and b/docs/0.7.1/doctrees/dataprofiler.labelers.classification_report_utils.doctree differ diff --git a/docs/0.7.1/doctrees/dataprofiler.labelers.data_labelers.doctree b/docs/0.7.1/doctrees/dataprofiler.labelers.data_labelers.doctree new file mode 100644 index 000000000..c095c3238 Binary files /dev/null and b/docs/0.7.1/doctrees/dataprofiler.labelers.data_labelers.doctree differ diff --git a/docs/0.7.1/doctrees/dataprofiler.labelers.data_processing.doctree b/docs/0.7.1/doctrees/dataprofiler.labelers.data_processing.doctree new file mode 100644 index 000000000..7072b4208 Binary files /dev/null and b/docs/0.7.1/doctrees/dataprofiler.labelers.data_processing.doctree differ diff --git a/docs/0.7.1/doctrees/dataprofiler.labelers.doctree b/docs/0.7.1/doctrees/dataprofiler.labelers.doctree new file mode 100644 index 000000000..76191f18c Binary files /dev/null and b/docs/0.7.1/doctrees/dataprofiler.labelers.doctree differ diff --git a/docs/0.7.1/doctrees/dataprofiler.labelers.labeler_utils.doctree b/docs/0.7.1/doctrees/dataprofiler.labelers.labeler_utils.doctree new file mode 100644 index 000000000..8c1607828 Binary files /dev/null and b/docs/0.7.1/doctrees/dataprofiler.labelers.labeler_utils.doctree differ diff --git a/docs/0.7.1/doctrees/dataprofiler.labelers.regex_model.doctree b/docs/0.7.1/doctrees/dataprofiler.labelers.regex_model.doctree new file mode 100644 index 000000000..6b2432ab8 Binary files /dev/null and b/docs/0.7.1/doctrees/dataprofiler.labelers.regex_model.doctree differ diff --git a/docs/0.7.1/doctrees/dataprofiler.profilers.base_column_profilers.doctree b/docs/0.7.1/doctrees/dataprofiler.profilers.base_column_profilers.doctree new file mode 100644 index 000000000..05a52d82a Binary files /dev/null and b/docs/0.7.1/doctrees/dataprofiler.profilers.base_column_profilers.doctree differ diff --git a/docs/0.7.1/doctrees/dataprofiler.profilers.categorical_column_profile.doctree b/docs/0.7.1/doctrees/dataprofiler.profilers.categorical_column_profile.doctree new file mode 100644 index 000000000..5436dba48 Binary files /dev/null and b/docs/0.7.1/doctrees/dataprofiler.profilers.categorical_column_profile.doctree differ diff --git a/docs/0.7.1/doctrees/dataprofiler.profilers.column_profile_compilers.doctree b/docs/0.7.1/doctrees/dataprofiler.profilers.column_profile_compilers.doctree new file mode 100644 index 000000000..913b6171e Binary files /dev/null and b/docs/0.7.1/doctrees/dataprofiler.profilers.column_profile_compilers.doctree differ diff --git a/docs/0.7.1/doctrees/dataprofiler.profilers.data_labeler_column_profile.doctree b/docs/0.7.1/doctrees/dataprofiler.profilers.data_labeler_column_profile.doctree new file mode 100644 index 000000000..e290db8f0 Binary files /dev/null and b/docs/0.7.1/doctrees/dataprofiler.profilers.data_labeler_column_profile.doctree differ diff --git a/docs/0.7.1/doctrees/dataprofiler.profilers.datetime_column_profile.doctree b/docs/0.7.1/doctrees/dataprofiler.profilers.datetime_column_profile.doctree new file mode 100644 index 000000000..15df826b2 Binary files /dev/null and b/docs/0.7.1/doctrees/dataprofiler.profilers.datetime_column_profile.doctree differ diff --git a/docs/0.7.1/doctrees/dataprofiler.profilers.doctree b/docs/0.7.1/doctrees/dataprofiler.profilers.doctree new file mode 100644 index 000000000..16333a2e1 Binary files /dev/null and b/docs/0.7.1/doctrees/dataprofiler.profilers.doctree differ diff 
--git a/docs/0.7.1/doctrees/dataprofiler.profilers.float_column_profile.doctree b/docs/0.7.1/doctrees/dataprofiler.profilers.float_column_profile.doctree new file mode 100644 index 000000000..f20e5c32d Binary files /dev/null and b/docs/0.7.1/doctrees/dataprofiler.profilers.float_column_profile.doctree differ diff --git a/docs/0.7.1/doctrees/dataprofiler.profilers.helpers.doctree b/docs/0.7.1/doctrees/dataprofiler.profilers.helpers.doctree new file mode 100644 index 000000000..dbb8cea10 Binary files /dev/null and b/docs/0.7.1/doctrees/dataprofiler.profilers.helpers.doctree differ diff --git a/docs/0.7.1/doctrees/dataprofiler.profilers.helpers.report_helpers.doctree b/docs/0.7.1/doctrees/dataprofiler.profilers.helpers.report_helpers.doctree new file mode 100644 index 000000000..d58197186 Binary files /dev/null and b/docs/0.7.1/doctrees/dataprofiler.profilers.helpers.report_helpers.doctree differ diff --git a/docs/0.7.1/doctrees/dataprofiler.profilers.histogram_utils.doctree b/docs/0.7.1/doctrees/dataprofiler.profilers.histogram_utils.doctree new file mode 100644 index 000000000..b77c904bf Binary files /dev/null and b/docs/0.7.1/doctrees/dataprofiler.profilers.histogram_utils.doctree differ diff --git a/docs/0.7.1/doctrees/dataprofiler.profilers.int_column_profile.doctree b/docs/0.7.1/doctrees/dataprofiler.profilers.int_column_profile.doctree new file mode 100644 index 000000000..6d96b54a5 Binary files /dev/null and b/docs/0.7.1/doctrees/dataprofiler.profilers.int_column_profile.doctree differ diff --git a/docs/0.7.1/doctrees/dataprofiler.profilers.numerical_column_stats.doctree b/docs/0.7.1/doctrees/dataprofiler.profilers.numerical_column_stats.doctree new file mode 100644 index 000000000..e7aef6c85 Binary files /dev/null and b/docs/0.7.1/doctrees/dataprofiler.profilers.numerical_column_stats.doctree differ diff --git a/docs/0.7.1/doctrees/dataprofiler.profilers.order_column_profile.doctree b/docs/0.7.1/doctrees/dataprofiler.profilers.order_column_profile.doctree new file mode 100644 index 000000000..e94606df7 Binary files /dev/null and b/docs/0.7.1/doctrees/dataprofiler.profilers.order_column_profile.doctree differ diff --git a/docs/0.7.1/doctrees/dataprofiler.profilers.profile_builder.doctree b/docs/0.7.1/doctrees/dataprofiler.profilers.profile_builder.doctree new file mode 100644 index 000000000..89c08cf81 Binary files /dev/null and b/docs/0.7.1/doctrees/dataprofiler.profilers.profile_builder.doctree differ diff --git a/docs/0.7.1/doctrees/dataprofiler.profilers.profiler_options.doctree b/docs/0.7.1/doctrees/dataprofiler.profilers.profiler_options.doctree new file mode 100644 index 000000000..5b40fdb3e Binary files /dev/null and b/docs/0.7.1/doctrees/dataprofiler.profilers.profiler_options.doctree differ diff --git a/docs/0.7.1/doctrees/dataprofiler.profilers.text_column_profile.doctree b/docs/0.7.1/doctrees/dataprofiler.profilers.text_column_profile.doctree new file mode 100644 index 000000000..239be616e Binary files /dev/null and b/docs/0.7.1/doctrees/dataprofiler.profilers.text_column_profile.doctree differ diff --git a/docs/0.7.1/doctrees/dataprofiler.profilers.unstructured_data_labeler_column_profile.doctree b/docs/0.7.1/doctrees/dataprofiler.profilers.unstructured_data_labeler_column_profile.doctree new file mode 100644 index 000000000..072cf3361 Binary files /dev/null and b/docs/0.7.1/doctrees/dataprofiler.profilers.unstructured_data_labeler_column_profile.doctree differ diff --git a/docs/0.7.1/doctrees/dataprofiler.profilers.unstructured_labeler_profile.doctree 
b/docs/0.7.1/doctrees/dataprofiler.profilers.unstructured_labeler_profile.doctree new file mode 100644 index 000000000..91eae205f Binary files /dev/null and b/docs/0.7.1/doctrees/dataprofiler.profilers.unstructured_labeler_profile.doctree differ diff --git a/docs/0.7.1/doctrees/dataprofiler.profilers.unstructured_text_profile.doctree b/docs/0.7.1/doctrees/dataprofiler.profilers.unstructured_text_profile.doctree new file mode 100644 index 000000000..e0d30af7f Binary files /dev/null and b/docs/0.7.1/doctrees/dataprofiler.profilers.unstructured_text_profile.doctree differ diff --git a/docs/0.7.1/doctrees/dataprofiler.profilers.utils.doctree b/docs/0.7.1/doctrees/dataprofiler.profilers.utils.doctree new file mode 100644 index 000000000..f32d910a3 Binary files /dev/null and b/docs/0.7.1/doctrees/dataprofiler.profilers.utils.doctree differ diff --git a/docs/0.7.1/doctrees/dataprofiler.reports.doctree b/docs/0.7.1/doctrees/dataprofiler.reports.doctree new file mode 100644 index 000000000..711163055 Binary files /dev/null and b/docs/0.7.1/doctrees/dataprofiler.reports.doctree differ diff --git a/docs/0.7.1/doctrees/dataprofiler.reports.graphs.doctree b/docs/0.7.1/doctrees/dataprofiler.reports.graphs.doctree new file mode 100644 index 000000000..a76b026cc Binary files /dev/null and b/docs/0.7.1/doctrees/dataprofiler.reports.graphs.doctree differ diff --git a/docs/0.7.1/doctrees/dataprofiler.settings.doctree b/docs/0.7.1/doctrees/dataprofiler.settings.doctree new file mode 100644 index 000000000..75faf4f39 Binary files /dev/null and b/docs/0.7.1/doctrees/dataprofiler.settings.doctree differ diff --git a/docs/0.7.1/doctrees/dataprofiler.validators.base_validators.doctree b/docs/0.7.1/doctrees/dataprofiler.validators.base_validators.doctree new file mode 100644 index 000000000..70cc90035 Binary files /dev/null and b/docs/0.7.1/doctrees/dataprofiler.validators.base_validators.doctree differ diff --git a/docs/0.7.1/doctrees/dataprofiler.validators.doctree b/docs/0.7.1/doctrees/dataprofiler.validators.doctree new file mode 100644 index 000000000..3e950cd53 Binary files /dev/null and b/docs/0.7.1/doctrees/dataprofiler.validators.doctree differ diff --git a/docs/0.7.1/doctrees/dataprofiler.version.doctree b/docs/0.7.1/doctrees/dataprofiler.version.doctree new file mode 100644 index 000000000..b004e1f9f Binary files /dev/null and b/docs/0.7.1/doctrees/dataprofiler.version.doctree differ diff --git a/docs/0.7.1/doctrees/environment.pickle b/docs/0.7.1/doctrees/environment.pickle new file mode 100644 index 000000000..231e83ebf Binary files /dev/null and b/docs/0.7.1/doctrees/environment.pickle differ diff --git a/docs/0.7.1/doctrees/examples.doctree b/docs/0.7.1/doctrees/examples.doctree new file mode 100644 index 000000000..514bb777e Binary files /dev/null and b/docs/0.7.1/doctrees/examples.doctree differ diff --git a/docs/0.7.1/doctrees/graphs.doctree b/docs/0.7.1/doctrees/graphs.doctree new file mode 100644 index 000000000..f616fe8c6 Binary files /dev/null and b/docs/0.7.1/doctrees/graphs.doctree differ diff --git a/docs/0.7.1/doctrees/index.doctree b/docs/0.7.1/doctrees/index.doctree new file mode 100644 index 000000000..4dd1126e3 Binary files /dev/null and b/docs/0.7.1/doctrees/index.doctree differ diff --git a/docs/0.7.1/doctrees/install.doctree b/docs/0.7.1/doctrees/install.doctree new file mode 100644 index 000000000..f2a4f8194 Binary files /dev/null and b/docs/0.7.1/doctrees/install.doctree differ diff --git a/docs/0.7.1/doctrees/labeler.doctree b/docs/0.7.1/doctrees/labeler.doctree new file mode 
100644 index 000000000..7b9c2ed25 Binary files /dev/null and b/docs/0.7.1/doctrees/labeler.doctree differ diff --git a/docs/0.7.1/doctrees/modules.doctree b/docs/0.7.1/doctrees/modules.doctree new file mode 100644 index 000000000..32edee832 Binary files /dev/null and b/docs/0.7.1/doctrees/modules.doctree differ diff --git a/docs/0.7.1/doctrees/nbsphinx/add_new_model_to_data_labeler.ipynb b/docs/0.7.1/doctrees/nbsphinx/add_new_model_to_data_labeler.ipynb new file mode 100644 index 000000000..3f59297bc --- /dev/null +++ b/docs/0.7.1/doctrees/nbsphinx/add_new_model_to_data_labeler.ipynb @@ -0,0 +1,438 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "228bb2a6", + "metadata": {}, + "source": [ + "# Adding new model to the existing DataLabeler pipeline" + ] + }, + { + "cell_type": "markdown", + "id": "cab7a569", + "metadata": {}, + "source": [ + "Consider the case when we would like to explore different character-level neural network models and evaluate their performance on different datasets. The existing DataLabeler in the DataProfiler library already contains a preprocessor, a postprocessor, and a character-level CNN (Convolutional Neural Network) model that are combined to work on such data. All we need is to build additional model classes that inherit the main functionalities from the CNN model and also adapt the model construction to the desired architectures. In this example, we define such a new model to be used with the Data Labeler component of the Data Profiler. In particular, a character-level LSTM (Long Short-Term Memory) model is implemented, then integrated into the DataLabeler pipeline to be trained with a tabular dataset. The process includes the following steps:\n", + "\n", + " - Build a new character-level LSTM model that inherits the CNN model\n", + " - Load the DataLabeler from the DataProfiler\n", + " - Swap the existing CNN model with the new LSTM model\n", + " - Train the data labeler pipeline on a given dataset\n", + "\n", + "First, let's import the libraries needed for this example." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16624c48", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import json\n", + "import pandas as pd\n", + "sys.path.insert(0, '..')\n", + "import dataprofiler as dp" + ] + }, + { + "cell_type": "markdown", + "id": "e90728ab", + "metadata": {}, + "source": [ + "## Dataset" + ] + }, + { + "cell_type": "markdown", + "id": "3d61981c", + "metadata": {}, + "source": [ + "In this example, we use a structured dataset, the aws honeypot dataset, given in the test folder of the library. This dataset is first read by the Data Reader class of the Data Profiler, then split into training and test data to be used in the next sections." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f031fe06", + "metadata": {}, + "outputs": [], + "source": [ + "# use data reader to read input data\n", + "data = dp.Data(\"../dataprofiler/tests/data/csv/aws_honeypot_marx_geo.csv\")\n", + "df_data = data.data\n", + "\n", + "# split data to training and test set\n", + "split_ratio = 0.2\n", + "df_data = df_data.sample(frac=1).reset_index(drop=True)\n", + "data_train = df_data[:int((1 - split_ratio) * len(df_data))]\n", + "data_test = df_data[int((1 - split_ratio) * len(df_data)):]\n", + "\n", + "df_data.head()" + ] + }, + { + "cell_type": "markdown", + "id": "745ed0d4", + "metadata": {}, + "source": [ + "## Implement a new character-level LSTM model" + ] + }, + { + "cell_type": "markdown", + "id": "7375b0c0", + "metadata": {}, + "source": [ + "This new model is inherited from `CharacterLevelCnnModel` class, with some modifications on the following functions\n", + "\n", + "`__init__`: to add new parameters for the LSTM model. The new parameters, `size_lstm`, `rec_dropout`, `activation`, `recurrent_activation`, specify number of LSTM layers, activation function, and recurrent dropout ratio.\n", + "\n", + "`_validate_parameters`: to add additional checks on the new parameters for the LSTM model\n", + "\n", + "`_construct_model`: to construct the new LSTM model with the desired architecture" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8568fb49", + "metadata": {}, + "outputs": [], + "source": [ + "import tensorflow as tf\n", + "import numpy as np\n", + "from dataprofiler.labelers.character_level_cnn_model import CharacterLevelCnnModel, F1Score, \\\n", + " create_glove_char, build_embd_dictionary\n", + "from dataprofiler.labelers.base_model import BaseModel\n", + "\n", + "# CharacterLevelLstmModel derives from CharacterLevelCnnModel\n", + "#########################################################\n", + "#########################################################\n", + "class CharacterLevelLstmModel(CharacterLevelCnnModel):\n", + "\n", + " # boolean if the label mapping requires the mapping for index 0 reserved\n", + " requires_zero_mapping = True\n", + "\n", + " def __init__(self, label_mapping=None, parameters=None):\n", + " \"\"\"\n", + " LSTM Model Initializer\n", + " \"\"\"\n", + " \n", + " # parameter initialization\n", + " if not parameters:\n", + " parameters = {}\n", + " parameters.setdefault('max_length', 3400)\n", + " parameters.setdefault('max_char_encoding_id', 127)\n", + " parameters.setdefault('dim_embed', 64)\n", + " parameters.setdefault('size_fc', [32, 32])\n", + " parameters.setdefault('dropout', 0.1)\n", + " # new parameters for LSTM model\n", + " #########################################################\n", + " #########################################################\n", + " parameters.setdefault('size_lstm', [64])\n", + " parameters.setdefault('rec_dropout', 0.1)\n", + " parameters.setdefault('activation', \"tanh\")\n", + " parameters.setdefault('recurrent_activation', \"sigmoid\")\n", + " #########################################################\n", + " #########################################################\n", + " parameters.setdefault('default_label', \"UNKNOWN\")\n", + " parameters['pad_label'] = 'PAD'\n", + " self._epoch_id = 0\n", + "\n", + " # reconstruct flags for model\n", + " self._model_num_labels = 0\n", + " self._model_default_ind = -1\n", + "\n", + " BaseModel.__init__(self, label_mapping, parameters)\n", + "\n", + " def _validate_parameters(self, 
parameters):\n", + " \"\"\"\n", + " Validate the parameters sent in. Raise error if invalid parameters are\n", + " present.\n", + " \"\"\"\n", + " errors = []\n", + " list_of_necessary_params = ['max_length', 'max_char_encoding_id',\n", + " 'dim_embed', 'size_fc', 'dropout',\n", + " 'size_lstm', 'rec_dropout', 'activation', \n", + " 'recurrent_activation', 'default_label', \n", + " 'pad_label']\n", + " # Make sure the necessary parameters are present and valid.\n", + " for param in parameters:\n", + " if param in ['max_length', 'max_char_encoding_id', 'dim_embed',\n", + " 'size_conv']:\n", + " if not isinstance(parameters[param], (int, float)) \\\n", + " or parameters[param] < 0:\n", + " errors.append(param + \" must be a valid integer or float \"\n", + " \"greater than 0.\")\n", + " elif param in ['dropout', 'rec_dropout']: # additional check for rec_dropout\n", + " if not isinstance(parameters[param], (int, float)) \\\n", + " or parameters[param] < 0 or parameters[param] > 1:\n", + " errors.append(param + \" must be a valid integer or float \"\n", + " \"from 0 to 1.\")\n", + " elif param == 'size_fc' or param == 'size_lstm': # additional check for size_lstm\n", + " if not isinstance(parameters[param], list) \\\n", + " or len(parameters[param]) == 0:\n", + " errors.append(param + \" must be a non-empty list of \"\n", + " \"integers.\")\n", + " else:\n", + " for item in parameters[param]:\n", + " if not isinstance(item, int):\n", + " errors.append(param + \" must be a non-empty \"\n", + " \"list of integers.\")\n", + " break\n", + " elif param in ['default_label', 'activation', 'recurrent_activation']: # additional check for activation and recurrent_activation\n", + " if not isinstance(parameters[param], str):\n", + " error = str(param) + \" must be a string.\"\n", + " errors.append(error)\n", + "\n", + " # Error if there are extra parameters thrown in\n", + " for param in parameters:\n", + " if param not in list_of_necessary_params:\n", + " errors.append(param + \" is not an accepted parameter.\")\n", + " if errors:\n", + " raise ValueError('\\n'.join(errors))\n", + "\n", + " def _construct_model(self):\n", + " \"\"\"\n", + " Model constructor for the data labeler. 
This also serves as a weight\n", + " reset.\n", + "\n", + " :return: None\n", + " \"\"\"\n", + " num_labels = self.num_labels\n", + " default_ind = self.label_mapping[self._parameters['default_label']]\n", + "\n", + " # Reset model\n", + " tf.keras.backend.clear_session()\n", + "\n", + " # generate glove embedding\n", + " create_glove_char(self._parameters['dim_embed'])\n", + "\n", + " # generate model\n", + " self._model = tf.keras.models.Sequential()\n", + "\n", + " # default parameters\n", + " max_length = self._parameters['max_length']\n", + " max_char_encoding_id = self._parameters['max_char_encoding_id']\n", + "\n", + " # Encoding layer\n", + " def encoding_function(input_str):\n", + " char_in_vector = CharacterLevelLstmModel._char_encoding_layer(\n", + " input_str, max_char_encoding_id, max_length)\n", + " return char_in_vector\n", + "\n", + " self._model.add(tf.keras.layers.Input(shape=(None,), dtype=tf.string))\n", + "\n", + " self._model.add(\n", + " tf.keras.layers.Lambda(encoding_function,\n", + " output_shape=tuple([max_length])))\n", + "\n", + " # Create a pre-trained weight matrix\n", + " # character encoding indices range from 0 to max_char_encoding_id,\n", + " # we add one extra index for out-of-vocabulary character\n", + " embed_file = os.path.join(\n", + " \"../dataprofiler/labelers\", \"embeddings/glove-reduced-{}D.txt\".format(\n", + " self._parameters['dim_embed']))\n", + " embedding_matrix = np.zeros((max_char_encoding_id + 2,\n", + " self._parameters['dim_embed']))\n", + " embedding_dict = build_embd_dictionary(embed_file)\n", + "\n", + " input_shape = tuple([max_length])\n", + " # Fill in the weight matrix: let pad and space be 0s\n", + " for ascii_num in range(max_char_encoding_id):\n", + " if chr(ascii_num) in embedding_dict:\n", + " embedding_matrix[ascii_num + 1] = embedding_dict[chr(ascii_num)]\n", + "\n", + " self._model.add(tf.keras.layers.Embedding(\n", + " max_char_encoding_id + 2,\n", + " self._parameters['dim_embed'],\n", + " weights=[embedding_matrix],\n", + " input_length=input_shape[0],\n", + " trainable=True))\n", + " \n", + " # Add the lstm layers\n", + " #########################################################\n", + " #########################################################\n", + " for size in self._parameters['size_lstm']:\n", + " self._model.add(\n", + " tf.keras.layers.LSTM(units=size, \n", + " recurrent_dropout=self._parameters['rec_dropout'], \n", + " activation=self._parameters['activation'],\n", + " recurrent_activation=self._parameters['recurrent_activation'],\n", + " return_sequences=True))\n", + " if self._parameters['dropout']:\n", + " self._model.add(tf.keras.layers.Dropout(self._parameters['dropout']))\n", + " #########################################################\n", + " #########################################################\n", + "\n", + " # Add the fully connected layers\n", + " for size in self._parameters['size_fc']:\n", + " self._model.add(\n", + " tf.keras.layers.Dense(units=size, activation='relu'))\n", + " if self._parameters['dropout']:\n", + " self._model.add(\n", + " tf.keras.layers.Dropout(self._parameters['dropout']))\n", + "\n", + " # Add the final Softmax layer\n", + " self._model.add(\n", + " tf.keras.layers.Dense(num_labels, activation='softmax'))\n", + "\n", + " # Output the model into a .pb file for TensorFlow\n", + " argmax_layer = tf.keras.backend.argmax(self._model.output)\n", + "\n", + " # Create confidence layers\n", + " final_predicted_layer = CharacterLevelLstmModel._argmax_threshold_layer(\n", + " 
num_labels, threshold=0.0, default_ind=default_ind)\n", + "\n", + " argmax_outputs = self._model.outputs + \\\n", + " [argmax_layer,\n", + " final_predicted_layer(argmax_layer, self._model.output)]\n", + " self._model = tf.keras.Model(self._model.inputs, argmax_outputs)\n", + "\n", + " # Compile the model\n", + " softmax_output_layer_name = self._model.outputs[0].name.split('/')[0]\n", + " losses = {softmax_output_layer_name: \"categorical_crossentropy\"}\n", + "\n", + " # use f1 score metric\n", + " f1_score_training = F1Score(num_classes=num_labels, average='micro')\n", + " metrics = {softmax_output_layer_name: ['acc', f1_score_training]}\n", + "\n", + " self._model.compile(loss=losses,\n", + " optimizer=\"adam\",\n", + " metrics=metrics)\n", + "\n", + " self._epoch_id = 0\n", + " self._model_num_labels = num_labels\n", + " self._model_default_ind = default_ind\n" + ] + }, + { + "cell_type": "markdown", + "id": "d66bd25c", + "metadata": {}, + "source": [ + "## Integrate the new LSTM model to the DataLabeler" + ] + }, + { + "cell_type": "markdown", + "id": "479f407a", + "metadata": {}, + "source": [ + "Once the LSTM model is built, it replaces the existing model in the DataLabeler pipeline, which is then trained on the given dataset. Note that, as the DataLabeler is trained on the above tabular dataset, its label mapping is updated by the list of column names in that dataset while training." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fb482ffe", + "metadata": {}, + "outputs": [], + "source": [ + "# get labels from the given dataset\n", + "value_label_df = data_train.reset_index(drop=True).melt()\n", + "value_label_df.columns = [1, 0] # labels=1, values=0 in that order\n", + "value_label_df = value_label_df.astype(str)\n", + "labels = value_label_df[1].unique().tolist()\n", + "\n", + "# create a new LSTM model\n", + "# set default label (one of the column names) to the model\n", + "model = CharacterLevelLstmModel(label_mapping=labels, parameters={'default_label': 'comment'})\n", + "\n", + "# add the new LSTM model to the data labeler\n", + "data_labeler = dp.DataLabeler(labeler_type='structured', trainable=True)\n", + "data_labeler.set_model(model)\n", + "\n", + "# set default label (one of the column names) to the preprocessor and postprocessor\n", + "processor_params = {'default_label': 'comment'}\n", + "data_labeler._preprocessor.set_params(**processor_params)\n", + "data_labeler._postprocessor.set_params(**processor_params)\n", + "\n", + "# train the data labeler\n", + "save_dirpath=\"data_labeler_saved\"\n", + "if not os.path.exists(save_dirpath):\n", + " os.makedirs(save_dirpath)\n", + "\n", + "epochs=2\n", + "data_labeler.fit(\n", + " x=value_label_df[0], y=value_label_df[1], labels=labels, epochs=epochs)\n", + "if save_dirpath:\n", + " data_labeler.save_to_disk(save_dirpath)" + ] + }, + { + "cell_type": "markdown", + "id": "14b78c69", + "metadata": {}, + "source": [ + "The trained Data Labeler is then used by the Data Profiler to provide the prediction on the new dataset. In this example, all options except data labeler are disabled for the sake of presenting data labeler functionality. The results are given in the columnar format where true column types are given in the first column, and the predicted column labels are given in the second column." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bdfcf1d2", + "metadata": {}, + "outputs": [], + "source": [ + "# predict with the data labeler object\n", + "profile_options = dp.ProfilerOptions()\n", + "profile_options.set({\"structured_options.text.is_enabled\": False, \n", + " \"int.is_enabled\": False, \n", + " \"float.is_enabled\": False, \n", + " \"order.is_enabled\": False, \n", + " \"category.is_enabled\": False, \n", + " \"datetime.is_enabled\": False,})\n", + "profile_options.set({'structured_options.data_labeler.data_labeler_object': data_labeler})\n", + "profile = dp.Profiler(data_test, options=profile_options)\n", + "\n", + "# get the prediction from the data profiler\n", + "def get_structured_results(results):\n", + " columns = []\n", + " predictions = []\n", + " for col_report in results['data_stats']:\n", + " columns.append(col_report['column_name'])\n", + " predictions.append(col_report['data_label'])\n", + "\n", + " df_results = pd.DataFrame({'Column': columns, 'Prediction': predictions})\n", + " return df_results\n", + "\n", + "results = profile.report()\n", + "print(get_structured_results(results))" + ] + }, + { + "cell_type": "markdown", + "id": "cc60ff8a", + "metadata": {}, + "source": [ + "In summary, users can define their own model, plug it in the DataLabeler pipeline, and train the labeler with the new dataset. Above, we show one example of adding the LSTM model to the pipeline. Interested users can implement other neural network models as desired with the same process." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/0.7.1/doctrees/nbsphinx/data_reader.ipynb b/docs/0.7.1/doctrees/nbsphinx/data_reader.ipynb new file mode 100644 index 000000000..731e46d25 --- /dev/null +++ b/docs/0.7.1/doctrees/nbsphinx/data_reader.ipynb @@ -0,0 +1,621 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "d4d79832-59ab-410a-ad6d-fbba01a3f0d3", + "metadata": {}, + "source": [ + "# Intro to Data Readers\n", + "Within the Data Profiler, there are 5 data reader classes:\n", + "\n", + " * CSVData (delimited data: CSV, TSV, etc.)\n", + " * JSONData\n", + " * ParquetData\n", + " * AVROData\n", + " * TextData\n", + " \n", + "Each of these classes can be used to read data individually, however the Data Profiler provides the unique capability of auto detecting what data you have and reading it automatically by using the `Data` class.\n", + "```python\n", + "import dataprofiler as dp\n", + "data = dp.Data('/path/to/mydata.abc') # auto detects and reads your data\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "f2315666-20be-4937-9f9a-26d42dc135e2", + "metadata": { + "tags": [] + }, + "source": [ + "## Automatically reading and detecting data\n", + "\n", + "Below is a demonstration of utilizing the `Data` class which automatically detects the type of data for a given file and reads it automatically." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "99e61c6c-43b8-4700-b627-759b5ef8bdda", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "\n", + "try:\n", + " sys.path.insert(0, '..')\n", + " import dataprofiler as dp\n", + "except ImportError:\n", + " import dataprofiler as dp" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8821ad8d-b2c0-489c-ae6a-54c11b7f0a08", + "metadata": {}, + "outputs": [], + "source": [ + "# use data reader to read input data with different file types\n", + "data_folder = \"../dataprofiler/tests/data\"\n", + "csv_files = [\n", + " \"csv/aws_honeypot_marx_geo.csv\",\n", + " \"csv/all-strings-skip-header-author.csv\", # csv files with the author/description on the first line\n", + " \"csv/sparse-first-and-last-column-empty-first-row.txt\", # csv file with the .txt extension\n", + "]\n", + "json_files = [\n", + " \"json/complex_nested.json\",\n", + " \"json/honeypot_intentially_mislabeled_file.csv\", # json file with the .csv extension\n", + "]\n", + "parquet_files = [\n", + " \"parquet/nation.dict.parquet\",\n", + " \"parquet/nation.plain.intentionally_mislabled_file.csv\", # parquet file with the .csv extension\n", + "]\n", + "avro_files = [\n", + " \"avro/userdata1.avro\",\n", + " \"avro/userdata1_intentionally_mislabled_file.json\", # avro file with the .json extension\n", + "]\n", + "text_files = [\n", + " \"txt/discussion_reddit.txt\",\n", + "]\n", + "all_files = csv_files + json_files + parquet_files + avro_files + text_files\n", + "print('filepath' + ' ' * 58 + 'data type')\n", + "print('='*80)\n", + "for file in all_files:\n", + " filepath = os.path.join(data_folder, file)\n", + " data = dp.Data(filepath)\n", + " print(\"{:<65} {:<15}\".format(file, data.data_type))\n", + "print(\"\\n\")" + ] + }, + { + "cell_type": "markdown", + "id": "77f8ef2d-5aaf-44d6-b6d1-bf14f7eb7aa6", + "metadata": {}, + "source": [ + "## Specifying detection options of `Data` and loading `pandas.DataFrame`\n", + "\n", + "The `Data` class also gives the ability to set options or if the user wants to load their data with specific requirements.\n", + "Options for each data reader are specified in the docs: https://capitalone.github.io/DataProfiler/docs/0.4.4/html/dataprofiler.data_readers.html\n", + "\n", + "```python\n", + "import dataprofiler as dp\n", + "\n", + "options = {...} # allowed options are specified for each data reader.\n", + "data = dp.Data(data, options=options)\n", + "```\n", + "Later in this tutorial, the options for the CSVData class will be discussed.\n", + "\n", + "Additionally, a user can directly load a `pandas.DataFrame` as any data reader they choose." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4b925d4e-ca94-4913-9acf-26a883585e85", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from dataprofiler.data_readers.csv_data import CSVData\n", + "\n", + "\n", + "df = pd.DataFrame(['my', 'random', 'data'])\n", + "\n", + "# specify via the `Data` class\n", + "data = dp.Data(data=df, data_type='csv')\n", + "print('Data Type: ', data.data_type)\n", + "\n", + "# specifically use the CSVData class\n", + "data = CSVData(data=df)\n", + "print('Data Type: ', data.data_type)" + ] + }, + { + "cell_type": "markdown", + "id": "52c3c3ac-c241-4d91-8ac7-b3d28ffd19c3", + "metadata": {}, + "source": [ + "## Accessing data and attributes\n", + "\n", + "Once loaded, the data can be accessed via the `data` property of the object. 
Additional information about the data loaded may differ between data readers.\n", + "\n", + "For this example we will focus on `CSVData`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "09fa5929-e710-4107-9313-1370ab639c9c", + "metadata": {}, + "outputs": [], + "source": [ + "filepath = \"../dataprofiler/tests/data/csv/aws_honeypot_marx_geo.csv\"\n", + "data = dp.Data(filepath)\n", + "print('Data Type: ', data.data_type)\n", + "print('Data Filepath: ', data.input_file_path)\n", + "print('File Encoding: ', data.file_encoding)\n", + "print('Data Length (two techniques): ', len(data), data.length)\n", + "print(\"Data Access:\")\n", + "data.data" + ] + }, + { + "cell_type": "markdown", + "id": "b98be971-4768-479d-9e54-00f05a6fb790", + "metadata": {}, + "source": [ + "## Checking data file types with `is_match`\n", + "\n", + "Each data reader has a class method `is_match` which determines whether or not a dataset is of a given data type.\n", + "```python\n", + "CSVData.is_match\n", + "JSONData.is_match\n", + "ParquetData.is_match\n", + "AVROData.is_match\n", + "TextData.is_match\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "104a32c1-1d50-4aa5-94ce-b2e72de38476", + "metadata": {}, + "outputs": [], + "source": [ + "# supplemental function\n", + "def add_true_false_color(value):\n", + " \"\"\"Converts True to green and False to red in printed text.\"\"\"\n", + " if value:\n", + " return \"\\x1b[92m \" + str(is_match) + \"\\x1b[0m\"\n", + " return \"\\x1b[31m \" + str(is_match) + \"\\x1b[0m\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "06868d90-2726-4096-a6da-3866174e6671", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from dataprofiler.data_readers.csv_data import CSVData\n", + "\n", + "\n", + "non_csv_files = [\n", + " 'json/iris-utf-8.json',\n", + " 'json/honeypot_intentially_mislabeled_file.csv',\n", + " 'parquet/titanic.parq',\n", + " 'parquet/nation.plain.intentionally_mislabled_file.csv',\n", + " 'txt/code.txt',\n", + " 'txt/sentence.txt',\n", + " 'avro/users.avro',\n", + " 'avro/snappy_compressed_intentionally_mislabeled_file.csv',\n", + "]\n", + "\n", + "print(\"Is the file a CSV?\")\n", + "print('=' * 80)\n", + "for file in csv_files:\n", + " filepath = os.path.join(data_folder, file)\n", + " is_match = CSVData.is_match(filepath)\n", + " print(add_true_false_color(is_match), ':', file)\n", + " print('=' * 80)\n", + " \n", + "for file in non_csv_files:\n", + " filepath = os.path.join(data_folder, file)\n", + " is_match = CSVData.is_match(filepath)\n", + " print(add_true_false_color(is_match), ':', file)\n", + " print('=' * 80)" + ] + }, + { + "cell_type": "markdown", + "id": "38889990-8e19-4114-a4f3-dc2af938e29d", + "metadata": {}, + "source": [ + "## Reloading data after altering options with `reload`\n", + "\n", + "There are two cases for using the reload function, both of which require the data type to have been interpreted correctly:\n", + "\n", + " 1. The options were not correctly determined\n", + " 2. The options were loaded correctly but a change is desired.\n", + " \n", + "In the example below, the `data_format` for reading the data is changed and the data is then reloaded." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "01870e8d-45ee-4f33-a088-4453c7ffc7c2", + "metadata": {}, + "outputs": [], + "source": [ + "filepath = \"../dataprofiler/tests/data/csv/diamonds.csv\"\n", + "\n", + "data = dp.Data(filepath)\n", + "print('original data:')\n", + "print('=' * 80)\n", + "print(data.data[:5])\n", + "\n", + "print()\n", + "data.reload(options={'data_format': 'records', 'record_samples_per_line': 1})\n", + "print('reloaded data:')\n", + "print('=' * 80)\n", + "data.data[:5]" + ] + }, + { + "cell_type": "markdown", + "id": "e2285f19-9b34-4484-beaa-79df890b2825", + "metadata": {}, + "source": [ + "## A deeper dive into `CSVData`\n", + "\n", + "The rest of this tutorial will focus on how to use the data reader class: `CSVData`. The `CSVData` class is used for reading delimited data. Delimited data are datasets which have their columns specified by a specific character, commonly the `,`. E.g. from the `diamonds.csv` dataset:\n", + "```\n", + "carat,cut,color,clarity,depth,table,price,x,y,z\n", + "0.23,Ideal,E,SI2,61.5,55,326,3.95,3.98,2.43\n", + "0.21,Premium,E,SI1,59.8,61,326,3.89,3.84,2.31\n", + "0.23,Good,E,VS1,56.9,65,327,4.05,4.07,2.31\n", + "0.29,Premium,I,VS2,62.4,58,334,4.2,4.23,2.63\n", + "0.31,Good,J,SI2,63.3,58,335,4.34,4.35,2.75\n", + "```\n", + "\n", + "However, the delimiter can be any character. Additionally, a `quotechar`, commonly `\"`, can be specified which allows a delimiter to be contained within a column value.\n", + "E.g. from the `blogposts.csv` dataset:\n", + "```\n", + "Blog Post,Date,Subject,Field\n", + "\"Monty Hall, meet Game Theory\",4/13/2014,Statistics,Mathematics\n", + "Gaussian Quadrature,4/13/2014,Algorithms,Mathematics\n", + "```\n", + "Notice how `\"Monty Hall, meet Game Theory\"` is contained by the quotechar because it contains the delimiter value `,`.\n", + "\n", + "These delimiter dataset parameters (and more) can be automatically determined by the `CSVData` data reader, however they can also be set via the options as demonstrated later in this tutorial." 
+ ] + }, + { + "cell_type": "markdown", + "id": "cccb6bf9-7fb8-46b8-992e-9caacb7ab3a8", + "metadata": {}, + "source": [ + "## Intro to the `CSVData` data reader\n", + "\n", + "Previously, it was shown that `CSVData` may automatically be detected using `Data` or can be manually specified by the user:\n", + "\n", + "```python\n", + "import dataprofiler as dp\n", + "from dataprofiler.data_readers.csv_data import CSVData\n", + "\n", + "data = dp.Data(filepath)\n", + "data = CSVData(filepath)\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e25f5130-4f19-40c5-9d13-549a04f1aef5", + "metadata": {}, + "outputs": [], + "source": [ + "# use data reader to read delimited data \n", + "data_folder = \"../dataprofiler/tests/data\"\n", + "csv_files = [\n", + " \"csv/diamonds.csv\",\n", + " \"csv/all-strings-skip-header-author.csv\", # csv files with the author/description on the first line\n", + " \"csv/sparse-first-and-last-column-empty-first-row.txt\", # csv file with the .txt extension\n", + "]\n", + "\n", + "for file in csv_files:\n", + " data = CSVData(os.path.join(data_folder, file))\n", + " print(data.data.head())\n", + " print('=' * 80)" + ] + }, + { + "cell_type": "markdown", + "id": "8940de56-1417-4bf6-af87-9d4d00b9a631", + "metadata": {}, + "source": [ + "## CSVData Options\n", + "\n", + "As mentioned preivously, `CSVData` has options that can be set to finetune its detection or to ensure the data is being read in a specific manner.\n", + "The options for `CSVData` are detailed below:\n", + "\n", + " * delimiter - delimiter used to decipher the csv input file\n", + " * quotechar - quote character used in the delimited file\n", + " * header - location of the header in the file.\n", + " * data_format - user selected format in which to return data can only be of specified types\n", + " * selected_columns - columns being selected from the entire dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3d74f2e8-0ec3-4e93-8778-0a5f013e0cdb", + "metadata": {}, + "outputs": [], + "source": [ + "# options are set via a dictionary object in which the parameters are specified.\n", + "# these are the default values for each option\n", + "options = {\n", + " \"delimiter\": \",\",\n", + " \"quotechar\": '\"',\n", + " \"header\": 'auto',\n", + " \"data_format\": \"dataframe\", # type: str, choices: \"dataframe\", \"records\"\n", + " \"selected_columns\": list(),\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "9af108a1-ffe6-4c3a-82cc-833b1a3b57a1", + "metadata": {}, + "source": [ + "## Options: delimiter and quotechar\n", + "\n", + "Below, both the auto detection and use of options will be illustrated for `delimiter` and `quotechar`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "570e20c3-198e-4356-98d3-92eb9655ef4e", + "metadata": {}, + "outputs": [], + "source": [ + "# display the data we are reading\n", + "filepath = \"../dataprofiler/tests/data/csv/daily-activity-sheet-@-singlequote.csv\"\n", + "num_lines = 10\n", + "with open(filepath) as fp:\n", + " print(''.join(fp.readlines()[:num_lines]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "98385148-861e-4eb1-ba8d-e93120515401", + "metadata": {}, + "outputs": [], + "source": [ + "data = dp.Data(filepath) # or use CSVData\n", + "print('Auto detected')\n", + "print('=' * 80)\n", + "print('delimiter: ', data.delimiter)\n", + "print('quotechar: ', data.quotechar)\n", + "data.data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7f5d9306-d90a-4fc6-85a7-a0d535fe2d80", + "metadata": {}, + "outputs": [], + "source": [ + "options = {'delimiter': '@', 'quotechar': \"'\"}\n", + "data = dp.Data(filepath, options=options) # or use CSVData\n", + "print('manually set')\n", + "print('=' * 80)\n", + "print('delimiter: ', data.delimiter)\n", + "print('quotechar: ', data.quotechar)\n", + "data.data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b7bfa60f-b5b9-48a5-adc5-3937aed145da", + "metadata": {}, + "outputs": [], + "source": [ + "# intentional failure with incorrect options\n", + "options = {'delimiter': ',', 'quotechar': '\"'}\n", + "\n", + "# will be interepted as TextData because the delimtier and quotechar were incorrect\n", + "data = dp.Data(filepath, options=options)\n", + "print('intentional faliure set')\n", + "print('=' * 80)\n", + "try:\n", + " print('delimiter: ', data.delimiter) # attribute error raised here, bc TextData, not CSVData\n", + " print('quotechar: ', data.quotechar)\n", + " \n", + " # should not reach this or something went wrong\n", + " raise Exception('Should have failed because this is detected as TextData.')\n", + "except AttributeError:\n", + " print('When data_type is not set or the CSVData is not set, it will fail over to the\\n'\n", + " 'next best reader. In this case it is \"TextData\"\\n')\n", + "data.data" + ] + }, + { + "cell_type": "markdown", + "id": "eeb41c7c-8319-40a3-9d87-88edbb3c5290", + "metadata": {}, + "source": [ + "## Options: header\n", + "\n", + "Below, both the auto detection and use of options will be illustrated for `header`.\n", + "\n", + "Notice how in the manually set mechanism, we are intentionally setting the header incorrectly to illustrate what happens." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16a927ef-1ba8-4bf2-ae40-2a9909030609", + "metadata": {}, + "outputs": [], + "source": [ + "# display the data we are reading\n", + "filepath = \"../dataprofiler/tests/data/csv/sparse-first-and-last-column-header-and-author-description.txt\"\n", + "num_lines = 10\n", + "with open(filepath) as fp:\n", + " print(''.join(fp.readlines()[:num_lines]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0701d7bf-2de0-4dce-8f09-7f0cddd1132c", + "metadata": {}, + "outputs": [], + "source": [ + "options = {'header': 'auto'} # auto detected (default value)\n", + "data = dp.Data(filepath, options=options) # or use CSVData\n", + "print('Data Header:', data.header)\n", + "print('=' * 80)\n", + "data.data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b8642a0a-367a-44c6-b611-b89d97b29f85", + "metadata": {}, + "outputs": [], + "source": [ + "options = {'header': 2} # intentionally set incorrectly at value 2\n", + "data = dp.Data(filepath, options=options) # or use CSVData\n", + "print('Data Header:', data.header)\n", + "print('=' * 80)\n", + "data.data.head()" + ] + }, + { + "cell_type": "markdown", + "id": "d6e3f640-c809-4eb6-9571-30065821615e", + "metadata": {}, + "source": [ + "## Options: data_format\n", + "\n", + "For CSVData, the `data_format` option can have the following values:\n", + "\n", + " * dataframe - (default) loads the dataset as a pandas.DataFrame\n", + " * records - loads the data as rows of text values, the extra parameter `record_samples_per_line` how many rows are combined into a single line\n", + " \n", + "`dataframe` is used for conducting **structured profiling** of the dataset while `records` is for **unstructured profiling**.\n", + "\n", + "Below, both the auto detection and use of options will be illustrated for `data_format`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "146109ea-a554-4766-bb19-78c116d2a8dd", + "metadata": {}, + "outputs": [], + "source": [ + "# display the data we are reading\n", + "filepath = \"../dataprofiler/tests/data/csv/diamonds.csv\"\n", + "num_lines = 10\n", + "with open(filepath) as fp:\n", + " print(''.join(fp.readlines()[:num_lines]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dceac967-d326-4064-ba1c-87a1146c9d72", + "metadata": {}, + "outputs": [], + "source": [ + "options = {'data_format': 'dataframe'} # default\n", + "data = dp.Data(filepath, options=options) # or use CSVData\n", + "data.data[:5]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2c25524f-ef23-4e06-9023-842c64c2640e", + "metadata": {}, + "outputs": [], + "source": [ + "options = {'data_format': 'records', 'record_samples_per_line': 1}\n", + "data = dp.Data(filepath, options=options)\n", + "data.data[:5]" + ] + }, + { + "cell_type": "markdown", + "id": "d45f3ed6-ddcd-4bf3-95bc-09f23eb94c97", + "metadata": {}, + "source": [ + "## Options: selected columns\n", + "\n", + "By default, all columns of a dataset will be read and loaded into the data reader. However, `selected_columns` can be set to only load columns which the user requests." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c9b45e18-93c6-42e6-b978-af51574307eb", + "metadata": {}, + "outputs": [], + "source": [ + "# display the data we are reading\n", + "filepath = \"../dataprofiler/tests/data/csv/aws_honeypot_marx_geo.csv\"\n", + "num_lines = 10\n", + "with open(filepath) as fp:\n", + " print(''.join(fp.readlines()[:num_lines]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "018f3f4d-32ac-411a-9918-bae78aff0b0e", + "metadata": {}, + "outputs": [], + "source": [ + "options = {'selected_columns': ['datetime', 'host', 'src', 'proto']}\n", + "data = dp.Data(filepath, options=options)\n", + "data.data.head()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/0.7.1/doctrees/nbsphinx/labeler.ipynb b/docs/0.7.1/doctrees/nbsphinx/labeler.ipynb new file mode 100644 index 000000000..c0d0bf359 --- /dev/null +++ b/docs/0.7.1/doctrees/nbsphinx/labeler.ipynb @@ -0,0 +1,622 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "spoken-reunion", + "metadata": {}, + "source": [ + "# Sensitive Data Detection with the Labeler" + ] + }, + { + "cell_type": "markdown", + "id": "interesting-bidder", + "metadata": {}, + "source": [ + "In this example, we utilize the Labeler component of the Data Profiler to detect the sensitive information for both structured and unstructured data. In addition, we show how to train the Labeler on some specific dataset with different list of entities.\n", + "\n", + "First, let's dive into what the Labeler is." + ] + }, + { + "cell_type": "markdown", + "id": "1965b83b", + "metadata": {}, + "source": [ + "## What is the Labeler" + ] + }, + { + "cell_type": "markdown", + "id": "388c643f", + "metadata": {}, + "source": [ + "The Labeler is a pipeline designed to make building, training, and predictions with ML models quick and easy. There are 3 major components to the Labeler: the preprocessor, the model, and the postprocessor." + ] + }, + { + "cell_type": "markdown", + "id": "e5d0aeb4", + "metadata": {}, + "source": [ + "![alt text](DL-Flowchart.png \"Title\")" + ] + }, + { + "cell_type": "markdown", + "id": "550323c7", + "metadata": {}, + "source": [ + "Each component can be switched out individually to suit your needs. As you might expect, the preprocessor takes in raw data and prepares it for the model, the model performs the prediction or training, and the postprocessor takes prediction results and turns them into human-readable results. \n", + "\n", + "Now let's run some examples. Start by importing all the requirements." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "scientific-stevens", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import json\n", + "import pandas as pd\n", + "sys.path.insert(0, '..')\n", + "import dataprofiler as dp\n", + "import tensorflow as tf\n", + "tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)" + ] + }, + { + "cell_type": "markdown", + "id": "5125b215", + "metadata": {}, + "source": [ + "## Structured Data Prediction" + ] + }, + { + "cell_type": "markdown", + "id": "wicked-devon", + "metadata": {}, + "source": [ + "We'll use the aws honeypot dataset in the test folder for this example. First, look at the data using the Data Reader class of the Data Profiler. This dataset is from the US department of educations, [found here!](https://data.ed.gov/dataset/college-scorecard-all-data-files-through-6-2020/resources?resource=823ac095-bdfc-41b0-b508-4e8fc3110082)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "adjusted-native", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "data = dp.Data(\"../dataprofiler/tests/data/csv/SchoolDataSmall.csv\")\n", + "df_data = data.data\n", + "df_data.head()" + ] + }, + { + "cell_type": "markdown", + "id": "ab6ccf8a", + "metadata": {}, + "source": [ + "We can directly predict the labels of a structured dataset on the cell level." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19529af4", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "labeler = dp.DataLabeler(labeler_type='structured')\n", + "\n", + "# print out the labels and label mapping\n", + "print(\"Labels: {}\".format(labeler.labels)) \n", + "print(\"\\n\")\n", + "print(\"Label Mapping: {}\".format(labeler.label_mapping))\n", + "print(\"\\n\")\n", + "\n", + "# make predictions and get labels for each cell going row by row\n", + "# predict options are model dependent and the default model can show prediction confidences\n", + "predictions = labeler.predict(data, predict_options={\"show_confidences\": True})\n", + "\n", + "# display prediction results\n", + "print(\"Predictions: {}\".format(predictions['pred']))\n", + "print(\"\\n\")\n", + "\n", + "# display confidence results\n", + "print(\"Confidences: {}\".format(predictions['conf']))" + ] + }, + { + "cell_type": "markdown", + "id": "2af72e2c", + "metadata": {}, + "source": [ + "The profiler uses the Labeler to perform column by column predictions. The data contains 11 columns, each of which has data label. Next, we will use the Labeler of the Data Profiler to predict the label for each column in this tabular dataset. Since we are only going to demo the labeling functionality, other options of the Data Profiler are disabled to keep this quick." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "secret-million", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# set options to only run the labeler\n", + "profile_options = dp.ProfilerOptions()\n", + "profile_options.set({\"structured_options.text.is_enabled\": False, \n", + " \"int.is_enabled\": False, \n", + " \"float.is_enabled\": False, \n", + " \"order.is_enabled\": False, \n", + " \"category.is_enabled\": False, \n", + " \"datetime.is_enabled\": False,})\n", + "\n", + "profile = dp.Profiler(data, options=profile_options)\n", + "\n", + "# get the prediction from the data profiler\n", + "def get_structured_results(results):\n", + " columns = []\n", + " predictions = []\n", + " for col_report in results['data_stats']:\n", + " columns.append(col_report['column_name'])\n", + " predictions.append(col_report['data_label'])\n", + "\n", + " df_results = pd.DataFrame({'Column': columns, 'Prediction': predictions})\n", + " return df_results\n", + "\n", + "results = profile.report() \n", + "print(get_structured_results(results))" + ] + }, + { + "cell_type": "markdown", + "id": "fatty-louisville", + "metadata": {}, + "source": [ + "In this example, the results show that the Data Profiler is able to detect integers, URLs, address, and floats appropriately. Unknown is typically strings of text, which is appropriate for those columns." + ] + }, + { + "cell_type": "markdown", + "id": "unavailable-diploma", + "metadata": {}, + "source": [ + "## Unstructured Data Prediction" + ] + }, + { + "cell_type": "markdown", + "id": "metallic-coaching", + "metadata": {}, + "source": [ + "Besides structured data, the Labeler detects the sensitive information on the unstructured text. We use a sample of spam email in Enron email dataset for this demo. As above, we start investigating the content of the given email sample." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "unauthorized-lounge", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# load data\n", + "data = \"Message-ID: <11111111.1111111111111.JavaMail.evans@thyme>\\n\" + \\\n", + " \"Date: Fri, 10 Aug 2005 11:31:37 -0700 (PDT)\\n\" + \\\n", + " \"From: w..smith@company.com\\n\" + \\\n", + " \"To: john.smith@company.com\\n\" + \\\n", + " \"Subject: RE: ABC\\n\" + \\\n", + " \"Mime-Version: 1.0\\n\" + \\\n", + " \"Content-Type: text/plain; charset=us-ascii\\n\" + \\\n", + " \"Content-Transfer-Encoding: 7bit\\n\" + \\\n", + " \"X-From: Smith, Mary W. \\n\" + \\\n", + " \"X-To: Smith, John \\n\" + \\\n", + " \"X-cc: \\n\" + \\\n", + " \"X-bcc: \\n\" + \\\n", + " \"X-Folder: \\SSMITH (Non-Privileged)\\Sent Items\\n\" + \\\n", + " \"X-Origin: Smith-S\\n\" + \\\n", + " \"X-FileName: SSMITH (Non-Privileged).pst\\n\\n\" + \\\n", + " \"All I ever saw was the e-mail from the office.\\n\\n\" + \\\n", + " \"Mary\\n\\n\" + \\\n", + " \"-----Original Message-----\\n\" + \\\n", + " \"From: Smith, John \\n\" + \\\n", + " \"Sent: Friday, August 10, 2005 13:07 PM\\n\" + \\\n", + " \"To: Smith, Mary W.\\n\" + \\\n", + " \"Subject: ABC\\n\\n\" + \\\n", + " \"Have you heard any more regarding the ABC sale? 
I guess that means that \" + \\\n", + " \"it's no big deal here, but you think they would have send something.\\n\\n\\n\" + \\\n", + " \"John Smith\\n\" + \\\n", + " \"123-456-7890\\n\"\n", + "\n", + "# convert string data to list to feed into the labeler\n", + "data = [data]" + ] + }, + { + "cell_type": "markdown", + "id": "concerned-segment", + "metadata": {}, + "source": [ + "By default, the Labeler predicts the results at the character level for unstructured text." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "junior-acrobat", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "labeler = dp.DataLabeler(labeler_type='unstructured')\n", + "\n", + "# make predictions and get labels per character\n", + "predictions = labeler.predict(data)\n", + "\n", + "# display results\n", + "print(predictions['pred'])" + ] + }, + { + "cell_type": "markdown", + "id": "individual-diabetes", + "metadata": {}, + "source": [ + "In addition to the character-level result, the Labeler provides the results at the word level following the standard NER (Named Entity Recognition), e.g., utilized by spaCy. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "optical-universe", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# convert prediction to word format and ner format\n", + "# Set the output to the NER format (start position, end position, label)\n", + "labeler.set_params(\n", + " { 'postprocessor': { 'output_format':'ner', 'use_word_level_argmax':True } } \n", + ")\n", + "\n", + "# make predictions and get labels per character\n", + "predictions = labeler.predict(data)\n", + "\n", + "# display results\n", + "print('\\n')\n", + "print('=======================Prediction======================\\n')\n", + "for pred in predictions['pred'][0]:\n", + " print('{}: {}'.format(data[0][pred[0]: pred[1]], pred[2]))\n", + " print('--------------------------------------------------------')" + ] + }, + { + "cell_type": "markdown", + "id": "behavioral-tourism", + "metadata": {}, + "source": [ + "Here, the Labeler is able to identify sensitive information such as datetime, email address, person names, and phone number in an email sample. " + ] + }, + { + "cell_type": "markdown", + "id": "nasty-disney", + "metadata": {}, + "source": [ + "## Train the Labeler from Scratch" + ] + }, + { + "cell_type": "markdown", + "id": "destroyed-twist", + "metadata": {}, + "source": [ + "The Labeler can be trained from scratch with a new list of labels. Below, we show an example of training the Labeler on a dataset with labels given as the columns of that dataset. For brevity's sake, let's only train a few epochs with a subset of a dataset." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "utility-evaluation", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "data = dp.Data(\"../dataprofiler/tests/data/csv/SchoolDataSmall.csv\")\n", + "df = data.data[[\"OPEID6\", \"INSTURL\", \"SEARCH_STRING\"]]\n", + "df.head()\n", + "\n", + "# split data to training and test set\n", + "split_ratio = 0.2\n", + "df = df.sample(frac=1).reset_index(drop=True)\n", + "data_train = df[:int((1 - split_ratio) * len(df))]\n", + "data_test = df[int((1 - split_ratio) * len(df)):]\n", + "\n", + "# train a new labeler with column names as labels\n", + "if not os.path.exists('data_labeler_saved'):\n", + " os.makedirs('data_labeler_saved')\n", + "\n", + "labeler = dp.train_structured_labeler(\n", + " data=data_train,\n", + " save_dirpath=\"data_labeler_saved\",\n", + " epochs=10,\n", + " default_label=\"OPEID6\"\n", + ")\n" + ] + }, + { + "cell_type": "markdown", + "id": "utility-torture", + "metadata": {}, + "source": [ + "The trained Labeler is then used by the Data Profiler to provide the prediction on the new dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "answering-panel", + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "# predict with the labeler object\n", + "profile_options.set({'structured_options.data_labeler.data_labeler_object': labeler})\n", + "profile = dp.Profiler(data_test, options=profile_options)\n", + "\n", + "# get the prediction from the data profiler\n", + "results = profile.report()\n", + "print(get_structured_results(results))" + ] + }, + { + "cell_type": "markdown", + "id": "polish-stand", + "metadata": {}, + "source": [ + "Another way to use the trained Labeler is through the directory path of the saved labeler." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "industrial-characterization", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# predict with the labeler loaded from path\n", + "profile_options.set({'structured_options.data_labeler.data_labeler_dirpath': 'data_labeler_saved'})\n", + "profile = dp.Profiler(data_test, options=profile_options)\n", + "\n", + "# get the prediction from the data profiler\n", + "results = profile.report()\n", + "print(get_structured_results(results))" + ] + }, + { + "cell_type": "markdown", + "id": "2acedba0", + "metadata": {}, + "source": [ + "## Transfer Learning a Labeler" + ] + }, + { + "cell_type": "markdown", + "id": "2f15fb1f", + "metadata": {}, + "source": [ + "Instead of training a model from scratch, we can also transfer learn to improve the model and/or extend the labels. Again for brevity's sake, let's only train a few epochs with a small dataset at the cost of accuracy." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0104c374", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "data = dp.Data(\"../dataprofiler/tests/data/csv/SchoolDataSmall.csv\")\n", + "df_data = data.data[[\"OPEID6\", \"INSTURL\", \"SEARCH_STRING\"]]\n", + "\n", + "\n", + "# prep data\n", + "df_data = df_data.reset_index(drop=True).melt()\n", + "df_data.columns = [1, 0] # labels=1, values=0 in that order\n", + "df_data = df_data.astype(str)\n", + "new_labels = df_data[1].unique().tolist()\n", + "\n", + "# load structured Labeler w/ trainable set to True\n", + "labeler = dp.DataLabeler(labeler_type='structured', trainable=True)\n", + "\n", + "# Reconstruct the model to add each new label\n", + "for label in new_labels:\n", + " labeler.add_label(label)\n", + "\n", + "# this will use transfer learning to retrain the labeler on your new\n", + "# dataset and labels.\n", + "# Setting labels with a list of labels or label mapping will overwrite the existing labels with new ones\n", + "# Setting the reset_weights parameter to false allows transfer learning to occur\n", + "model_results = labeler.fit(x=df_data[0], y=df_data[1], validation_split=0.2, \n", + " epochs=10, labels=None, reset_weights=False)" + ] + }, + { + "cell_type": "markdown", + "id": "ae78745f", + "metadata": {}, + "source": [ + "Let's display the training results of the last epoch:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b764aa8c", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "print(\"{:16s} Precision Recall F1-score Support\".format(\"\"))\n", + "for item in model_results[-1][2]:\n", + " print(\"{:16s} {:4.3f} {:4.3f} {:4.3f} {:7.0f}\".format(item,\n", + " model_results[-1][2][item][\"precision\"],\n", + " model_results[-1][2][item][\"recall\"],\n", + " model_results[-1][2][item][\"f1-score\"],\n", + " model_results[-1][2][item][\"support\"]))" + ] + }, + { + "cell_type": "markdown", + "id": "44009522", + "metadata": {}, + "source": [ + "It is now trained to detect additional labels! The model results here show the training accuracy for all labels. Since only the new labels existed in the dataset, only the new labels are given accuracy scores. Keep in mind this is a small dataset for brevity's sake and that real training would involve more samples and better results." + ] + }, + { + "cell_type": "markdown", + "id": "e110ee1c", + "metadata": {}, + "source": [ + "## Saving and Loading a Labeler" + ] + }, + { + "cell_type": "markdown", + "id": "c484d193", + "metadata": {}, + "source": [ + "The Labeler can easily be saved or loaded with one simple line." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d8684fa", + "metadata": {}, + "outputs": [], + "source": [ + "# Ensure save directory exists\n", + "if not os.path.exists('my_labeler'):\n", + " os.makedirs('my_labeler')\n", + "\n", + "# Saving the labeler\n", + "labeler.save_to_disk(\"my_labeler\")\n", + "\n", + "# Loading the labeler\n", + "labeler = dp.DataLabeler(labeler_type='structured', dirpath=\"my_labeler\")" + ] + }, + { + "cell_type": "markdown", + "id": "8d36dec8", + "metadata": {}, + "source": [ + "## Building a Labeler from the Ground Up" + ] + }, + { + "cell_type": "markdown", + "id": "59346d2b", + "metadata": {}, + "source": [ + "As mentioned earlier, the labeler is composed of three components, and each of the components can be created and interchanged in the labeler pipeline."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6506ef97", + "metadata": {}, + "outputs": [], + "source": [ + "import random\n", + "from dataprofiler.labelers.character_level_cnn_model import \\\n", + " CharacterLevelCnnModel\n", + "from dataprofiler.labelers.data_processing import \\\n", + " StructCharPreprocessor, StructCharPostprocessor\n", + "\n", + "model = CharacterLevelCnnModel({\"PAD\":0, \"UNKNOWN\":1, \"Test_Label\":2})\n", + "preprocessor = StructCharPreprocessor()\n", + "postprocessor = StructCharPostprocessor()\n", + "\n", + "labeler = dp.DataLabeler(labeler_type='structured')\n", + "labeler.set_preprocessor(preprocessor)\n", + "labeler.set_model(model)\n", + "labeler.set_postprocessor(postprocessor)\n", + "\n", + "# check for basic compatibility between the processors and the model\n", + "labeler.check_pipeline()\n", + "\n", + "# Optionally set the parameters\n", + "parameters={\n", + " 'preprocessor':{\n", + " 'max_length': 100,\n", + " },\n", + " 'model':{\n", + " 'max_length': 100,\n", + " },\n", + " 'postprocessor':{\n", + " 'random_state': random.Random(1)\n", + " }\n", + "} \n", + "labeler.set_params(parameters)\n", + "\n", + "labeler.help()" + ] + }, + { + "cell_type": "markdown", + "id": "5f020d7f", + "metadata": {}, + "source": [ + "The components can each be created if you inherit the BaseModel and BaseProcessor for the model and processors, respectively. More info can be found about coding your own components in the Labeler section of the [documentation]( https://capitalone.github.io/dataprofiler). In summary, the Data Profiler open source library can be used to scan sensitive information in both structured and unstructured data with different file types. It supports multiple input formats and output formats at word and character levels. Users can also train the labeler on their own datasets." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/0.7.1/doctrees/nbsphinx/overview.ipynb b/docs/0.7.1/doctrees/nbsphinx/overview.ipynb new file mode 100644 index 000000000..dafec60ab --- /dev/null +++ b/docs/0.7.1/doctrees/nbsphinx/overview.ipynb @@ -0,0 +1,463 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "fc2826d9", + "metadata": {}, + "source": [ + "# Data Profiler - What's in your data?" + ] + }, + { + "cell_type": "markdown", + "id": "b997522b", + "metadata": {}, + "source": [ + "This introductory jupyter notebook demonstrates the basic usages of the Data Profiler. The library is designed to easily detect sensitive data and gather statistics on your datasets with just several lines of code. The Data Profiler can handle several different data types including: CSV (or any delimited file), JSON, Parquet, AVRO, and text. Additionally, there are a plethora of options to customize your profile. This library also has the ability to update profiles from multiple batches of large datasets, or merge multiple profiles. 
In particular, this example covers the following:\n", + "\n", + " - Basic usage of the Data Profiler\n", + " - The data reader class\n", + " - Profiler options\n", + " - Updating profiles and merging profiles\n", + "\n", + "First, let's import the libraries needed for this example." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ef404c84", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import json\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "sys.path.insert(0, '..')\n", + "import dataprofiler as dp\n", + "\n", + "data_path = \"../dataprofiler/tests/data\"" + ] + }, + { + "cell_type": "markdown", + "id": "f51971e3", + "metadata": {}, + "source": [ + "## Basic Usage of the Data Profiler" + ] + }, + { + "cell_type": "markdown", + "id": "639e66d3", + "metadata": {}, + "source": [ + "This section shows a basic example of the Data Profiler. A CSV dataset is read using the data reader, then the Data object is given to the Data Profiler to detect sensitive data and obtain the statistics." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5379c45c", + "metadata": {}, + "outputs": [], + "source": [ + "# use data reader to read input data\n", + "data = dp.Data(os.path.join(data_path, \"csv/aws_honeypot_marx_geo.csv\"))\n", + "print(data.data.head())\n", + "\n", + "# run data profiler and get the report\n", + "profile = dp.Profiler(data)\n", + "report = profile.report(report_options={\"output_format\":\"compact\"})\n", + "\n", + "# print the report\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "57fe2827", + "metadata": {}, + "source": [ + "The report includes `global_stats` and `data_stats` for the given dataset. The former contains overall properties of the data such as the number of rows/columns, null ratio, and duplicate ratio, while the latter contains specific properties and statistics for each column, such as the detected data label, min, max, mean, variance, etc. In this example, the `compact` format of the report is used to shorten the full list of the results. To get more results related to detailed predictions at the entity level from the Data Labeler component or histogram results, the format `pretty` should be used." + ] + }, + { + "cell_type": "markdown", + "id": "74027cfd", + "metadata": {}, + "source": [ + "## Data reader class" + ] + }, + { + "cell_type": "markdown", + "id": "41364888", + "metadata": {}, + "source": [ + "DataProfiler can detect multiple file types including CSV (or any delimited file), JSON, Parquet, AVRO, and text. The example below shows that it successfully detects data types from multiple categories regardless of the file extensions."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "823829f4", + "metadata": {}, + "outputs": [], + "source": [ + "# use data reader to read input data with different file types\n", + "csv_files = [\n", + " \"csv/aws_honeypot_marx_geo.csv\",\n", + " \"csv/all-strings-skip-header-author.csv\", # csv files with the author/description on the first line\n", + " \"csv/sparse-first-and-last-column-empty-first-row.txt\", # csv file with the .txt extension\n", + "]\n", + "json_files = [\n", + " \"json/complex_nested.json\",\n", + " \"json/honeypot_intentially_mislabeled_file.csv\", # json file with the .csv extension\n", + "]\n", + "parquet_files = [\n", + " \"parquet/nation.dict.parquet\",\n", + " \"parquet/nation.plain.intentionally_mislabled_file.csv\", # parquet file with the .csv extension\n", + "]\n", + "avro_files = [\n", + " \"avro/userdata1.avro\",\n", + " \"avro/userdata1_intentionally_mislabled_file.json\", # avro file with the .json extension\n", + "]\n", + "text_files = [\n", + " \"txt/discussion_reddit.txt\",\n", + "]\n", + "\n", + "all_files = {\n", + " \"csv\": csv_files,\n", + " \"json\": json_files,\n", + " \"parquet\": parquet_files,\n", + " \"avro\": avro_files,\n", + " \"text\": text_files\n", + "}\n", + "\n", + "for file_type in all_files:\n", + " print(file_type)\n", + " for file in all_files[file_type]:\n", + " data = dp.Data(os.path.join(data_path, file))\n", + " print(\"{:<85} {:<15}\".format(file, data.data_type))\n", + " print(\"\\n\")" + ] + }, + { + "cell_type": "markdown", + "id": "3f9d7e02", + "metadata": {}, + "source": [ + "The `Data` class detects the file type and uses one of the following classes: `CSVData`, `JSONData`, `ParquetData`, `AVROData`, `TextData`. Users can call these specific classes directly if desired. For example, below we provide a collection of data with different types, each of them is processed by the corresponding data class." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "831e68a3", + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "# use individual data reader classes\n", + "from dataprofiler.data_readers.csv_data import CSVData\n", + "from dataprofiler.data_readers.json_data import JSONData\n", + "from dataprofiler.data_readers.parquet_data import ParquetData\n", + "from dataprofiler.data_readers.avro_data import AVROData\n", + "from dataprofiler.data_readers.text_data import TextData\n", + "\n", + "csv_files = \"csv/aws_honeypot_marx_geo.csv\"\n", + "json_files = \"json/complex_nested.json\"\n", + "parquet_files = \"parquet/nation.dict.parquet\"\n", + "avro_files = \"avro/userdata1.avro\"\n", + "text_files = \"txt/discussion_reddit.txt\"\n", + "\n", + "all_files = {\n", + " \"csv\": [csv_files, CSVData],\n", + " \"json\": [json_files, JSONData],\n", + " \"parquet\": [parquet_files, ParquetData],\n", + " \"avro\": [avro_files, AVROData],\n", + " \"text\": [text_files, TextData],\n", + "}\n", + "\n", + "for file_type in all_files:\n", + " file, data_reader = all_files[file_type]\n", + " data = data_reader(os.path.join(data_path, file))\n", + " print(\"File name {}\\n\".format(file))\n", + " if file_type == \"text\":\n", + " print(data.data[0][:1000]) # print the first 1000 characters\n", + " else:\n", + " print(data.data)\n", + " print('===============================================================================')" + ] + }, + { + "cell_type": "markdown", + "id": "572df0a8", + "metadata": {}, + "source": [ + "In addition to reading the input data from multiple file types, the Data Profiler also accepts the input data as a dataframe." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "df87ab83", + "metadata": {}, + "outputs": [], + "source": [ + "# run data profiler and get the report\n", + "my_dataframe = pd.DataFrame([[1, 2.0],[1, 2.2],[-1, 3]], columns=[\"col_int\", \"col_float\"])\n", + "profile = dp.Profiler(my_dataframe)\n", + "report = profile.report(report_options={\"output_format\":\"compact\"})\n", + "\n", + "# Print the report\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "84a06312", + "metadata": {}, + "source": [ + "## Structured Profiler vs. Unstructured Profiler" + ] + }, + { + "cell_type": "markdown", + "id": "4c0ea925", + "metadata": {}, + "source": [ + "The profiler will infer what type of statistics to generate (structured or unstructured) based on the input. However, you can explicitly specify the profile type as well. Here is an example of the profiler explicitly calling the structured profile and the unstructured profile."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f4565d8", + "metadata": {}, + "outputs": [], + "source": [ + "# Using the structured profiler\n", + "data = dp.Data(os.path.join(data_path, \"csv/aws_honeypot_marx_geo.csv\"))\n", + "profile = dp.Profiler(data, profiler_type='structured')\n", + "\n", + "report = profile.report(report_options={\"output_format\": \"pretty\"})\n", + "print(json.dumps(report, indent=4))\n", + "\n", + "# Using the unstructured profiler\n", + "my_dataframe = pd.DataFrame([[\"Sample1\"],[\"Sample2\"],[\"Sample3\"]], columns=[\"Text_Samples\"])\n", + "profile = dp.Profiler(my_dataframe, profiler_type='unstructured')\n", + "\n", + "report = profile.report(report_options={\"output_format\":\"pretty\"})\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "b16648ba", + "metadata": {}, + "source": [ + "## Profiler options" + ] + }, + { + "cell_type": "markdown", + "id": "8b0cc8ad", + "metadata": {}, + "source": [ + "The Data Profiler can enable/disable statistics and modify features through profiler options. For example, if the users only want the statistics information, they may turn off the Data Labeler functionality. Below, let's remove the histogram and data labeler component while running Data Profiler." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bbac3a2c", + "metadata": {}, + "outputs": [], + "source": [ + "profile_options = dp.ProfilerOptions()\n", + "profile_options.set({\"histogram_and_quantiles.is_enabled\": False,\n", + " \"data_labeler.is_enabled\": False,})\n", + "\n", + "profile = dp.Profiler(my_dataframe, options=profile_options)\n", + "report = profile.report(report_options={\"output_format\":\"pretty\"})\n", + "\n", + "# Print the report\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "590ca50b", + "metadata": {}, + "source": [ + "Besides toggling on and off features, other options like the data labeler sample size or histogram bin method can be directly set and validated as shown here:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4ed21bc1", + "metadata": {}, + "outputs": [], + "source": [ + "profile_options = dp.ProfilerOptions()\n", + "profile_options.structured_options.data_labeler.sample_size = 1\n", + "profile_options.structured_options.int.histogram_and_quantiles.bin_count_or_method = \"rice\"\n", + "# An error will raise if the options are set incorrectly.\n", + "profile_options.validate()\n", + "\n", + "profile = dp.Profiler(my_dataframe, options=profile_options)\n", + "report = profile.report(report_options={\"output_format\":\"pretty\"})\n", + "\n", + "# Print the report\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "9f690616", + "metadata": {}, + "source": [ + "## Update profiles" + ] + }, + { + "cell_type": "markdown", + "id": "965f8c85", + "metadata": {}, + "source": [ + "One of the interesting features of the Data Profiler is the ability to update profiles from batches of data, which allows for data streaming usage. In this section, the original dataset is separated into two batches with equal size. Each batch is then updated with Data Profiler sequentially. \n", + "\n", + "After the update, we expect the resulted profiles give the same statistics as the profiles updated from the full dataset. 
We will verify this through several properties in `global_stats` of the profiles, including `column_count`, `row_count`, `row_is_null_ratio`, and `duplicate_row_count`. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "34ac4346", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# read the input data and divide it into two equal halves\n", + "data = dp.Data(os.path.join(data_path, \"csv/aws_honeypot_marx_geo.csv\"))\n", + "df = data.data\n", + "df1 = df.iloc[:int(len(df)/2)]\n", + "df2 = df.iloc[int(len(df)/2):]\n", + "\n", + "# Create a profile from the first half\n", + "profile = dp.Profiler(df1)\n", + "\n", + "# Update the profile with the second half\n", + "profile.update_profile(df2)\n", + "\n", + "# Create a profile from the full dataset for comparison\n", + "profile_full = dp.Profiler(df)\n", + "\n", + "report = profile.report(report_options={\"output_format\":\"compact\"})\n", + "report_full = profile_full.report(report_options={\"output_format\":\"compact\"})\n", + "\n", + "# print the report\n", + "print(json.dumps(report, indent=4))\n", + "print(json.dumps(report_full, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "b41ee2bf", + "metadata": {}, + "source": [ + "You can see that the profiles are exactly the same whether they are broken into several updates or not." + ] + }, + { + "cell_type": "markdown", + "id": "c547f051", + "metadata": {}, + "source": [ + "## Merge profiles" + ] + }, + { + "cell_type": "markdown", + "id": "a5292962", + "metadata": {}, + "source": [ + "In addition to the profile update, Data Profiler provides the merging functionality which allows users to combine profiles updated from multiple locations. This enables Data Profiler to be used in a distributed computing environment. Below, we assume that the two aforementioned halves of the original dataset come from two different machines. Each half is then profiled with the Data Profiler on its own machine, and the resulting profiles are merged.\n", + "\n", + "As with the profile update, we expect the merged profile to give the same statistics as the profile computed from the full dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a565b8d1", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# Create a profile from the first half\n", + "profile1 = dp.Profiler(df1)\n", + "\n", + "# Create a profile from the second half\n", + "profile2 = dp.Profiler(df2)\n", + "\n", + "# merge profiles\n", + "profile_merge = profile1 + profile2\n", + "\n", + "# check results of the merged profile\n", + "report_merge = profile_merge.report(report_options={\"output_format\":\"compact\"})\n", + "\n", + "# print the report\n", + "print(json.dumps(report_merge, indent=4))\n", + "print(json.dumps(report_full, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "b77fac3f", + "metadata": {}, + "source": [ + "You can see that the profiles are exactly the same!" + ] + }, + { + "cell_type": "markdown", + "id": "c644ee42", + "metadata": {}, + "source": [ + "## Conclusion\n", + "\n", + "We have walked through some basic examples of Data Profiler usage, with different input data types and profiling options. We also worked with the update and merge functionality of the Data Profiler, which makes it applicable to data streaming and distributed environments. Interested users can try it with different datasets and functionalities as desired."
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/0.7.1/doctrees/nbsphinx/profiler_example.ipynb b/docs/0.7.1/doctrees/nbsphinx/profiler_example.ipynb new file mode 100644 index 000000000..cc4ecc218 --- /dev/null +++ b/docs/0.7.1/doctrees/nbsphinx/profiler_example.ipynb @@ -0,0 +1,451 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "f37ca393", + "metadata": {}, + "source": [ + "# Structured Profilers" + ] + }, + { + "cell_type": "markdown", + "id": "ff9bd095", + "metadata": {}, + "source": [ + "**Data profiling** - *is the process of examining a dataset and collecting statistical or informational summaries about said dataset.*\n", + "\n", + "The Profiler class inside the DataProfiler is designed to generate *data profiles*; it ingests either a Data class or a Pandas DataFrame. \n", + "\n", + "Currently, the Data class supports loading the following file formats:\n", + "\n", + "* Any delimited (CSV, TSV, etc.)\n", + "* JSON object\n", + "* Avro\n", + "* Parquet\n", + "* Text files\n", + "* Pandas Series/Dataframe\n", + "\n", + "Once the data is loaded, the Profiler can calculate statistics and predict the entities (via the Labeler) of every column (csv) or key-value pair (JSON), as well as dataset-wide information, such as the number of nulls, duplicates, etc.\n", + "\n", + "This example will look specifically at the structured data types for structured profiling. " + ] + }, + { + "cell_type": "markdown", + "id": "de58b9c4", + "metadata": {}, + "source": [ + "## Reporting" + ] + }, + { + "cell_type": "markdown", + "id": "8001185a", + "metadata": {}, + "source": [ + "One of the primary purposes of the Profiler is to quickly identify what is in the dataset. This can be useful for analyzing a dataset prior to use or determining which columns could be useful for a given purpose.\n", + "\n", + "In terms of reporting, there are multiple reporting options:\n", + "\n", + "* **Pretty**: Floats are rounded to four decimal places, and lists are shortened.\n", + "* **Compact**: Similar to pretty, but removes detailed statistics such as runtimes, label probabilities, index locations of null types, etc.\n", + "* **Serializable**: Output is json serializable and not prettified\n", + "* **Flat**: Nested Output is returned as a flattened dictionary\n", + "\n", + "The **Pretty** and **Compact** reports are the two most commonly used reports and include `global_stats` and `data_stats` for the given dataset. `global_stats` contains overall properties of the data such as number of rows/columns, null ratio, duplicate ratio. 
`data_stats` contains specific properties and statistics for each column file such as min, max, mean, variance, etc.\n", + "\n", + "For structured profiles, the report looks like this:\n", + "\n", + "```\n", + "\"global_stats\": {\n", + " \"samples_used\": int,\n", + " \"column_count\": int,\n", + " \"row_count\": int,\n", + " \"row_has_null_ratio\": float,\n", + " \"row_is_null_ratio\": float, \n", + " \"unique_row_ratio\": float,\n", + " \"duplicate_row_count\": int,\n", + " \"file_type\": string,\n", + " \"encoding\": string,\n", + "},\n", + "\"data_stats\": [\n", + " {\n", + " \"column_name\": string,\n", + " \"data_type\": string,\n", + " \"data_label\": string,\n", + " \"categorical\": bool,\n", + " \"order\": string,\n", + " \"samples\": list(str),\n", + " \"statistics\": {\n", + " \"sample_size\": int,\n", + " \"null_count\": int,\n", + " \"null_types\": list(string),\n", + " \"null_types_index\": {\n", + " string: list(int)\n", + " },\n", + " \"data_type_representation\": [string, list(string)],\n", + " \"min\": [null, float],\n", + " \"max\": [null, float],\n", + " \"mean\": float,\n", + " \"variance\": float,\n", + " \"stddev\": float,\n", + " \"histogram\": { \n", + " \"bin_counts\": list(int),\n", + " \"bin_edges\": list(float),\n", + " },\n", + " \"quantiles\": {\n", + " int: float\n", + " }\n", + " \"vocab\": list(char),\n", + " \"avg_predictions\": dict(float), \n", + " \"data_label_representation\": dict(float),\n", + " \"categories\": list(str),\n", + " \"unique_count\": int,\n", + " \"unique_ratio\": float,\n", + " \"precision\": {\n", + " 'min': int,\n", + " 'max': int,\n", + " 'mean': float,\n", + " 'var': float,\n", + " 'std': float,\n", + " 'sample_size': int,\n", + " 'margin_of_error': float,\n", + " 'confidence_level': float\t\t\n", + " },\n", + " \"times\": dict(float),\n", + " \"format\": string\n", + " }\n", + " }\n", + "]\n", + "```\n", + "\n", + "In the example, the `compact` format of the report is used to shorten the full list of the results. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5fcb5447", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import json\n", + "sys.path.insert(0, '..')\n", + "import dataprofiler as dp\n", + "\n", + "data_path = \"../dataprofiler/tests/data\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a7fc2df6", + "metadata": {}, + "outputs": [], + "source": [ + "data = dp.Data(os.path.join(data_path, \"csv/aws_honeypot_marx_geo.csv\"))\n", + "profile = dp.Profiler(data)\n", + "\n", + "# Compact - A high level view, good for quick reviews\n", + "report = profile.report(report_options={\"output_format\":\"compact\"})\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "d7ec39d2", + "metadata": {}, + "source": [ + "It should be noted, in addition to reading the input data from multiple file types, DataProfiler allows the input data as a dataframe. To get more results related to detailed predictions at the entity level from the DataLabeler component or histogram results, the format `pretty` should be used. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "29737f25", + "metadata": {}, + "outputs": [], + "source": [ + "# run data profiler and get the report\n", + "import pandas as pd\n", + "my_dataframe = pd.DataFrame([[1, 2.0],[1, 2.2],[-1, 3]], columns=[\"col_int\", \"col_float\"])\n", + "profile = dp.Profiler(my_dataframe)\n", + "\n", + "report = profile.report(report_options={\"output_format\":\"pretty\"})\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "241f6e3e", + "metadata": {}, + "source": [ + "# Profiler Type" + ] + }, + { + "cell_type": "markdown", + "id": "5b20879b", + "metadata": {}, + "source": [ + "The profiler will infer what type of statistics to generate (structured or unstructured) based on the input. However, you can explicitly specify profile type as well. Here is an example of the the profiler explicitly calling the structured profile." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dc44eb47", + "metadata": {}, + "outputs": [], + "source": [ + "data = dp.Data(os.path.join(data_path, \"csv/aws_honeypot_marx_geo.csv\"))\n", + "profile = dp.Profiler(data, profiler_type='structured')\n", + "\n", + "# print the report using json to prettify.\n", + "report = profile.report(report_options={\"output_format\": \"pretty\"})\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "fe02ad64", + "metadata": {}, + "source": [ + "## Profiler options" + ] + }, + { + "cell_type": "markdown", + "id": "40804cc9", + "metadata": {}, + "source": [ + "The DataProfiler has the ability to turn on and off components as needed. This is accomplished via the `ProfilerOptions` class.\n", + "\n", + "For example, if a user doesn't require histogram information they may desire to turn off the histogram functionality. Simialrly, if a user is looking for a more accurate labeling, they can increase the samples used to label.\n", + "\n", + "Below, let's remove the histogram and increase the number of samples to the labeler component (1,000 samples). \n", + "\n", + "Full list of options in the Profiler section of the [DataProfiler documentation](https://capitalone.github.io/DataProfiler)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d25d899", + "metadata": {}, + "outputs": [], + "source": [ + "data = dp.Data(os.path.join(data_path, \"csv/diamonds.csv\"))\n", + "\n", + "profile_options = dp.ProfilerOptions()\n", + "\n", + "# Setting multiple options via set\n", + "profile_options.set({ \"histogram.is_enabled\": False, \"int.is_enabled\": False})\n", + "\n", + "# Set options via directly setting them\n", + "profile_options.structured_options.data_labeler.max_sample_size = 1000\n", + "\n", + "profile = dp.Profiler(data, options=profile_options)\n", + "report = profile.report(report_options={\"output_format\":\"compact\"})\n", + "\n", + "# Print the report\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "2052415a", + "metadata": {}, + "source": [ + "## Updating Profiles" + ] + }, + { + "cell_type": "markdown", + "id": "7e02f746", + "metadata": {}, + "source": [ + "Beyond just profiling, one of the unique aspects of the DataProfiler is the ability to update the profiles. To update appropriately, the schema (columns / keys) must match appropriately." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ab8022f", + "metadata": {}, + "outputs": [], + "source": [ + "# Load and profile a CSV file\n", + "data = dp.Data(os.path.join(data_path, \"csv/sparse-first-and-last-column-header-and-author.txt\"))\n", + "profile = dp.Profiler(data)\n", + "\n", + "# Update the profile with new data:\n", + "new_data = dp.Data(os.path.join(data_path, \"csv/sparse-first-and-last-column-skip-header.txt\"))\n", + "# new_data = dp.Data(os.path.join(data_path, \"iris-utf-16.csv\")) # will error due to schema mismatch\n", + "profile.update_profile(new_data)\n", + "\n", + "# Take a peek at the data\n", + "print(data.data)\n", + "print(new_data.data)\n", + "\n", + "# Report the compact version of the profile\n", + "report = profile.report(report_options={\"output_format\":\"compact\"})\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "66ec6dc5", + "metadata": {}, + "source": [ + "## Merging Profiles" + ] + }, + { + "cell_type": "markdown", + "id": "e2265fe9", + "metadata": {}, + "source": [ + "Merging profiles is an alternative method for updating profiles. In particular, multiple profiles can be generated separately, then added together with a simple `+` command: `profile3 = profile1 + profile2`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cc68ca07", + "metadata": {}, + "outputs": [], + "source": [ + "# Load a CSV file with a schema\n", + "data1 = dp.Data(os.path.join(data_path, \"csv/sparse-first-and-last-column-header-and-author.txt\"))\n", + "profile1 = dp.Profiler(data1)\n", + "\n", + "# Load another CSV file with the same schema\n", + "data2 = dp.Data(os.path.join(data_path, \"csv/sparse-first-and-last-column-skip-header.txt\"))\n", + "profile2 = dp.Profiler(data2)\n", + "\n", + "# Merge the profiles\n", + "profile3 = profile1 + profile2\n", + "\n", + "# Report the compact version of the profile\n", + "report = profile3.report(report_options={\"output_format\":\"compact\"})\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "7ea07dc6", + "metadata": {}, + "source": [ + "As you can see, the `update_profile` function and the `+` operator function similarly. The reason the `+` operator is important is that it's possible to *save and load profiles*, which we cover next." + ] + }, + { + "cell_type": "markdown", + "id": "30868000", + "metadata": {}, + "source": [ + "## Saving and Loading a Profile" + ] + }, + { + "cell_type": "markdown", + "id": "f2858072", + "metadata": {}, + "source": [ + "Not only can the Profiler create and update profiles, it's also possible to save, load, and then manipulate profiles."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2ad9ca57", + "metadata": {}, + "outputs": [], + "source": [ + "# Load data\n", + "data = dp.Data(os.path.join(data_path, \"csv/names-col.txt\"))\n", + "\n", + "# Generate a profile\n", + "profile = dp.Profiler(data)\n", + "\n", + "# Save a profile to disk for later (saves as pickle file)\n", + "profile.save(filepath=\"my_profile.pkl\")\n", + "\n", + "# Load a profile from disk\n", + "loaded_profile = dp.Profiler.load(\"my_profile.pkl\")\n", + "\n", + "# Report the compact version of the profile\n", + "report = profile.report(report_options={\"output_format\":\"compact\"})\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "8f9859c2", + "metadata": {}, + "source": [ + "With the ability to save and load profiles, profiles can be generated via multiple machines then merged. Further, profiles can be stored and later used in applications such as change point detection, synthetic data generation, and more. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3571f2d0", + "metadata": {}, + "outputs": [], + "source": [ + "# Load a multiple files via the Data class\n", + "filenames = [\"csv/sparse-first-and-last-column-header-and-author.txt\",\n", + " \"csv/sparse-first-and-last-column-skip-header.txt\"]\n", + "data_objects = []\n", + "for filename in filenames:\n", + " data_objects.append(dp.Data(os.path.join(data_path, filename)))\n", + "\n", + "\n", + "# Generate and save profiles\n", + "for i in range(len(data_objects)):\n", + " profile = dp.Profiler(data_objects[i])\n", + " profile.save(filepath=\"data-\"+str(i)+\".pkl\")\n", + "\n", + "\n", + "# Load profiles and add them together\n", + "profile = None\n", + "for i in range(len(data_objects)):\n", + " if profile is None:\n", + " profile = dp.Profiler.load(\"data-\"+str(i)+\".pkl\")\n", + " else:\n", + " profile += dp.Profiler.load(\"data-\"+str(i)+\".pkl\")\n", + "\n", + "\n", + "# Report the compact version of the profile\n", + "report = profile.report(report_options={\"output_format\":\"compact\"})\n", + "print(json.dumps(report, indent=4))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/0.7.1/doctrees/nbsphinx/unstructured_profiler_example.ipynb b/docs/0.7.1/doctrees/nbsphinx/unstructured_profiler_example.ipynb new file mode 100644 index 000000000..e738cf3a7 --- /dev/null +++ b/docs/0.7.1/doctrees/nbsphinx/unstructured_profiler_example.ipynb @@ -0,0 +1,388 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "f37ca393", + "metadata": {}, + "source": [ + "# Unstructured Profilers" + ] + }, + { + "cell_type": "markdown", + "id": "ff9bd095", + "metadata": {}, + "source": [ + "**Data profiling** - *is the process of examining a dataset and collecting statistical or informational summaries about said dataset.*\n", + "\n", + "The Profiler class inside the DataProfiler is designed to generate *data profiles* via the Profiler class, which ingests either a Data class or a Pandas DataFrame. 
\n", + "\n", + "Currently, the Data class supports loading the following file formats:\n", + "\n", + "* Any delimited (CSV, TSV, etc.)\n", + "* JSON object\n", + "* Avro\n", + "* Parquet\n", + "* Text files\n", + "* Pandas Series/Dataframe\n", + "\n", + "Once the data is loaded, the Profiler can calculate statistics and predict the entities (via the Labeler) of every column (csv) or key-value pair (JSON), as well as dataset-wide information, such as the number of nulls, duplicates, etc.\n", + "\n", + "This example will look specifically at the unstructured data types for unstructured profiling. This means that only text files, lists of strings, single-column pandas dataframes/series, or DataProfile Data objects in string format will work with the unstructured profiler. " + ] + }, + { + "cell_type": "markdown", + "id": "de58b9c4", + "metadata": {}, + "source": [ + "## Reporting" + ] + }, + { + "cell_type": "markdown", + "id": "8001185a", + "metadata": {}, + "source": [ + "One of the primary purposes of the Profiler is to quickly identify what is in the dataset. This can be useful for analyzing a dataset prior to use or determining which columns could be useful for a given purpose.\n", + "\n", + "In terms of reporting, there are multiple reporting options:\n", + "\n", + "* **Pretty**: Floats are rounded to four decimal places, and lists are shortened.\n", + "* **Compact**: Similar to pretty, but removes detailed statistics\n", + "* **Serializable**: Output is json serializable and not prettified\n", + "* **Flat**: Nested Output is returned as a flattened dictionary\n", + "\n", + "The **Pretty** and **Compact** reports are the two most commonly used reports and include `global_stats` and `data_stats` for the given dataset. `global_stats` contains overall properties of the data such as samples used and file encoding. 
`data_stats` contains specific properties and statistics for each text sample.\n", + "\n", + "For unstructured profiles, the report looks like this:\n", + "\n", + "```\n", + "\"global_stats\": {\n", + " \"samples_used\": int,\n", + " \"empty_line_count\": int,\n", + " \"file_type\": string,\n", + " \"encoding\": string\n", + "},\n", + "\"data_stats\": {\n", + " \"data_label\": {\n", + " \"entity_counts\": {\n", + " \"word_level\": dict(int),\n", + " \"true_char_level\": dict(int),\n", + " \"postprocess_char_level\": dict(int)\n", + " },\n", + " \"times\": dict(float)\n", + " },\n", + " \"statistics\": {\n", + " \"vocab\": list(char),\n", + " \"words\": list(string),\n", + " \"word_count\": dict(int),\n", + " \"times\": dict(float)\n", + " }\n", + "}\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5fcb5447", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import json\n", + "sys.path.insert(0, '..')\n", + "import dataprofiler as dp\n", + "\n", + "data_path = \"../dataprofiler/tests/data\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a7fc2df6", + "metadata": {}, + "outputs": [], + "source": [ + "data = dp.Data(os.path.join(data_path, \"txt/discussion_reddit.txt\"))\n", + "profile = dp.Profiler(data)\n", + "\n", + "report = profile.report(report_options={\"output_format\": \"pretty\"})\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "4d183992", + "metadata": {}, + "source": [ + "## Profiler Type" + ] + }, + { + "cell_type": "markdown", + "id": "d7ec39d2", + "metadata": {}, + "source": [ + "It should be noted, in addition to reading the input data from text files, DataProfiler allows the input data as a pandas dataframe, a pandas series, a list, and Data objects (when an unstructured format is selected) if the Profiler is explicitly chosen as unstructured." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "29737f25", + "metadata": {}, + "outputs": [], + "source": [ + "# run data profiler and get the report\n", + "import pandas as pd\n", + "data = dp.Data(os.path.join(data_path, \"csv/SchoolDataSmall.csv\"), options={\"data_format\": \"records\"})\n", + "profile = dp.Profiler(data, profiler_type='unstructured')\n", + "\n", + "report = profile.report(report_options={\"output_format\":\"pretty\"})\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "fe02ad64", + "metadata": {}, + "source": [ + "## Profiler options" + ] + }, + { + "cell_type": "markdown", + "id": "40804cc9", + "metadata": {}, + "source": [ + "The DataProfiler has the ability to turn on and off components as needed. This is accomplished via the `ProfilerOptions` class.\n", + "\n", + "For example, if a user doesn't require vocab count information they may desire to turn off the word count functionality.\n", + "\n", + "Below, let's remove the vocab count and set the stop words. \n", + "\n", + "Full list of options in the Profiler section of the [DataProfiler documentation](https://capitalone.github.io/DataProfiler)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d25d899", + "metadata": {}, + "outputs": [], + "source": [ + "data = dp.Data(os.path.join(data_path, \"txt/discussion_reddit.txt\"))\n", + "\n", + "profile_options = dp.ProfilerOptions()\n", + "\n", + "# Setting multiple options via set\n", + "profile_options.set({ \"*.vocab.is_enabled\": False, \"*.is_case_sensitive\": True })\n", + "\n", + "# Set options via directly setting them\n", + "profile_options.unstructured_options.text.stop_words = [\"These\", \"are\", \"stop\", \"words\"]\n", + "\n", + "profile = dp.Profiler(data, options=profile_options)\n", + "report = profile.report(report_options={\"output_format\": \"compact\"})\n", + "\n", + "# Print the report\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "2052415a", + "metadata": {}, + "source": [ + "## Updating Profiles" + ] + }, + { + "cell_type": "markdown", + "id": "7e02f746", + "metadata": {}, + "source": [ + "Beyond just profiling, one of the unique aspects of the DataProfiler is the ability to update the profiles. To update a profile, the schema (columns / keys) of the new data must match the existing profile." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ab8022f", + "metadata": {}, + "outputs": [], + "source": [ + "# Load and profile a text file\n", + "data = dp.Data(os.path.join(data_path, \"txt/sentence-3x.txt\"))\n", + "profile = dp.Profiler(data)\n", + "\n", + "# Update the profile with new data:\n", + "new_data = dp.Data(os.path.join(data_path, \"txt/sentence-3x.txt\"))\n", + "profile.update_profile(new_data)\n", + "\n", + "# Take a peek at the data\n", + "print(data.data)\n", + "print(new_data.data)\n", + "\n", + "# Report the compact version of the profile\n", + "report = profile.report(report_options={\"output_format\": \"compact\"})\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "66ec6dc5", + "metadata": {}, + "source": [ + "## Merging Profiles" + ] + }, + { + "cell_type": "markdown", + "id": "e2265fe9", + "metadata": {}, + "source": [ + "Merging profiles is an alternative method for updating profiles. In particular, multiple profiles can be generated separately, then added together with a simple `+` command: `profile3 = profile1 + profile2`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cc68ca07", + "metadata": {}, + "outputs": [], + "source": [ + "# Load a text file\n", + "data1 = dp.Data(os.path.join(data_path, \"txt/sentence-3x.txt\"))\n", + "profile1 = dp.Profiler(data1)\n", + "\n", + "# Load the same text file again\n", + "data2 = dp.Data(os.path.join(data_path, \"txt/sentence-3x.txt\"))\n", + "profile2 = dp.Profiler(data2)\n", + "\n", + "# Merge the profiles\n", + "profile3 = profile1 + profile2\n", + "\n", + "# Report the compact version of the profile\n", + "report = profile3.report(report_options={\"output_format\":\"compact\"})\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "7ea07dc6", + "metadata": {}, + "source": [ + "As you can see, the `update_profile` function and the `+` operator function similarly. The reason the `+` operator is important is that it's possible to *save and load profiles*, which we cover next."
+ ] + }, + { + "cell_type": "markdown", + "id": "30868000", + "metadata": {}, + "source": [ + "## Saving and Loading a Profile" + ] + }, + { + "cell_type": "markdown", + "id": "f2858072", + "metadata": {}, + "source": [ + "Not only can the Profiler create and update profiles, it's also possible to save, load then manipulate profiles." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2ad9ca57", + "metadata": {}, + "outputs": [], + "source": [ + "# Load data\n", + "data = dp.Data(os.path.join(data_path, \"txt/sentence-3x.txt\"))\n", + "\n", + "# Generate a profile\n", + "profile = dp.Profiler(data)\n", + "\n", + "# Save a profile to disk for later (saves as pickle file)\n", + "profile.save(filepath=\"my_profile.pkl\")\n", + "\n", + "# Load a profile from disk\n", + "loaded_profile = dp.Profiler.load(\"my_profile.pkl\")\n", + "\n", + "# Report the compact version of the profile\n", + "report = profile.report(report_options={\"output_format\":\"compact\"})\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "8f9859c2", + "metadata": {}, + "source": [ + "With the ability to save and load profiles, profiles can be generated via multiple machines then merged. Further, profiles can be stored and later used in applications such as change point detection, synthetic data generation, and more. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3571f2d0", + "metadata": {}, + "outputs": [], + "source": [ + "# Load a multiple files via the Data class\n", + "filenames = [\"txt/sentence-3x.txt\",\n", + " \"txt/sentence.txt\"]\n", + "data_objects = []\n", + "for filename in filenames:\n", + " data_objects.append(dp.Data(os.path.join(data_path, filename)))\n", + "\n", + "print(data_objects)\n", + "# Generate and save profiles\n", + "for i in range(len(data_objects)):\n", + " profile = dp.Profiler(data_objects[i])\n", + " report = profile.report(report_options={\"output_format\":\"compact\"})\n", + " print(json.dumps(report, indent=4))\n", + " profile.save(filepath=\"data-\"+str(i)+\".pkl\")\n", + "\n", + "\n", + "# Load profiles and add them together\n", + "profile = None\n", + "for i in range(len(data_objects)):\n", + " if profile is None:\n", + " profile = dp.Profiler.load(\"data-\"+str(i)+\".pkl\")\n", + " else:\n", + " profile += dp.Profiler.load(\"data-\"+str(i)+\".pkl\")\n", + "\n", + "\n", + "# Report the compact version of the profile\n", + "report = profile.report(report_options={\"output_format\":\"compact\"})\n", + "print(json.dumps(report, indent=4))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/0.7.1/doctrees/overview.doctree b/docs/0.7.1/doctrees/overview.doctree new file mode 100644 index 000000000..884e65de7 Binary files /dev/null and b/docs/0.7.1/doctrees/overview.doctree differ diff --git a/docs/0.7.1/doctrees/profiler.doctree b/docs/0.7.1/doctrees/profiler.doctree new file mode 100644 index 000000000..17ddddbfc Binary files /dev/null and b/docs/0.7.1/doctrees/profiler.doctree differ diff --git a/docs/0.7.1/doctrees/profiler_example.doctree b/docs/0.7.1/doctrees/profiler_example.doctree new file mode 100644 index 
000000000..f5f7dba84 Binary files /dev/null and b/docs/0.7.1/doctrees/profiler_example.doctree differ diff --git a/docs/0.7.1/doctrees/unstructured_profiler_example.doctree b/docs/0.7.1/doctrees/unstructured_profiler_example.doctree new file mode 100644 index 000000000..21f083b4d Binary files /dev/null and b/docs/0.7.1/doctrees/unstructured_profiler_example.doctree differ diff --git a/docs/0.7.1/html/.buildinfo b/docs/0.7.1/html/.buildinfo new file mode 100644 index 000000000..f452200c4 --- /dev/null +++ b/docs/0.7.1/html/.buildinfo @@ -0,0 +1,4 @@ +# Sphinx build info version 1 +# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. +config: 8eda56380f3179102c2828ac5714946a +tags: 645f666f9bcd5a90fca523b33c5a78b7 diff --git a/docs/0.7.1/html/API.html b/docs/0.7.1/html/API.html new file mode 100644 index 000000000..9ce0597ff --- /dev/null +++ b/docs/0.7.1/html/API.html @@ -0,0 +1,282 @@ + + + + + + + + + API - Data Profiler v0.7.1 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+ + + + + + + \ No newline at end of file diff --git a/docs/0.7.1/html/_images/DL-Flowchart.png b/docs/0.7.1/html/_images/DL-Flowchart.png new file mode 100644 index 000000000..696eeb5dc Binary files /dev/null and b/docs/0.7.1/html/_images/DL-Flowchart.png differ diff --git a/docs/0.7.1/html/_images/histogram_example_0.png b/docs/0.7.1/html/_images/histogram_example_0.png new file mode 100644 index 000000000..9b8301363 Binary files /dev/null and b/docs/0.7.1/html/_images/histogram_example_0.png differ diff --git a/docs/0.7.1/html/_images/histogram_example_1.png b/docs/0.7.1/html/_images/histogram_example_1.png new file mode 100644 index 000000000..062dfdbb9 Binary files /dev/null and b/docs/0.7.1/html/_images/histogram_example_1.png differ diff --git a/docs/0.7.1/html/_images/histogram_example_2.png b/docs/0.7.1/html/_images/histogram_example_2.png new file mode 100644 index 000000000..1aedf7549 Binary files /dev/null and b/docs/0.7.1/html/_images/histogram_example_2.png differ diff --git a/docs/0.7.1/html/_sources/API.rst.txt b/docs/0.7.1/html/_sources/API.rst.txt new file mode 100644 index 000000000..fdbf2242b --- /dev/null +++ b/docs/0.7.1/html/_sources/API.rst.txt @@ -0,0 +1,16 @@ +.. _API: + +API +*** + +The API is split into 4 main components: Profilers, Labelers, Data Readers, and +Validators. + +.. toctree:: + :maxdepth: 1 + :caption: Contents: + + dataprofiler.data_readers + dataprofiler.profilers + dataprofiler.labelers + dataprofiler.validators \ No newline at end of file diff --git a/docs/0.7.1/html/_sources/add_new_model_to_data_labeler.nblink.txt b/docs/0.7.1/html/_sources/add_new_model_to_data_labeler.nblink.txt new file mode 100644 index 000000000..130e413fc --- /dev/null +++ b/docs/0.7.1/html/_sources/add_new_model_to_data_labeler.nblink.txt @@ -0,0 +1,3 @@ +{ + "path": "../../feature_branch/examples/add_new_model_to_data_labeler.ipynb" +} \ No newline at end of file diff --git a/docs/0.7.1/html/_sources/data_labeling.rst.txt b/docs/0.7.1/html/_sources/data_labeling.rst.txt new file mode 100644 index 000000000..db76fe791 --- /dev/null +++ b/docs/0.7.1/html/_sources/data_labeling.rst.txt @@ -0,0 +1,365 @@ +.. _data_labeling: + +Labeler (Sensitive Data) +************************ + +In this library, the term *data labeling* refers to entity recognition. + +Builtin to the data profiler is a classifier which evaluates the complex data types of the dataset. +For structured data, it determines the complex data type of each column. When +running the data profile, it uses the default data labeling model builtin to the +library. However, the data labeler allows users to train their own data labeler +as well. + +*Data Labels* are determined per cell for structured data (column/row when +the *profiler* is used) or at the character level for unstructured data. This +is a list of the default labels. + +* UNKNOWN +* ADDRESS +* BAN (bank account number, 10-18 digits) +* CREDIT_CARD +* EMAIL_ADDRESS +* UUID +* HASH_OR_KEY (md5, sha1, sha256, random hash, etc.) +* IPV4 +* IPV6 +* MAC_ADDRESS +* PERSON +* PHONE_NUMBER +* SSN +* URL +* US_STATE +* DRIVERS_LICENSE +* DATE +* TIME +* DATETIME +* INTEGER +* FLOAT +* QUANTITY +* ORDINAL + + +Identify Entities in Structured Data +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Makes predictions and identifying labels: + +.. 
code-block:: python + + import dataprofiler as dp + + # load data and data labeler + data = dp.Data("your_data.csv") + data_labeler = dp.DataLabeler(labeler_type='structured') + + # make predictions and get labels per cell + predictions = data_labeler.predict(data) + +Identify Entities in Unstructured Data +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Predict which class characters belong to in unstructured text: + +.. code-block:: python + + import dataprofiler as dp + + data_labeler = dp.DataLabeler(labeler_type='unstructured') + + # Example sample string, must be in an array (multiple arrays can be passed) + sample = ["Help\tJohn Macklemore\tneeds\tfood.\tPlease\tCall\t555-301-1234." + "\tHis\tssn\tis\tnot\t334-97-1234. I'm a BAN: 000043219499392912.\n"] + + # Prediction what class each character belongs to + model_predictions = data_labeler.predict( + sample, predict_options=dict(show_confidences=True)) + + # Predictions / confidences are at the character level + final_results = model_predictions["pred"] + final_confidences = model_predictions["conf"] + +It's also possible to change output formats, output similar to a **SpaCy** format: + +.. code-block:: python + + import dataprofiler as dp + + data_labeler = dp.DataLabeler(labeler_type='unstructured', trainable=True) + + # Example sample string, must be in an array (multiple arrays can be passed) + sample = ["Help\tJohn Macklemore\tneeds\tfood.\tPlease\tCall\t555-301-1234." + "\tHis\tssn\tis\tnot\t334-97-1234. I'm a BAN: 000043219499392912.\n"] + + # Set the output to the NER format (start position, end position, label) + data_labeler.set_params( + { 'postprocessor': { 'output_format':'ner', 'use_word_level_argmax':True } } + ) + + results = data_labeler.predict(sample) + + print(results) + +Train a New Data Labeler +~~~~~~~~~~~~~~~~~~~~~~~~ + +Mechanism for training your own data labeler on their own set of structured data +(tabular): + +.. code-block:: python + + import dataprofiler as dp + + # Will need one column with a default label of UNKNOWN + data = dp.Data("your_file.csv") + + data_labeler = dp.train_structured_labeler( + data=data, + save_dirpath="/path/to/save/labeler", + epochs=2 + ) + + data_labeler.save_to_disk("my/save/path") # Saves the data labeler for reuse + +Load an Existing Data Labeler +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Mechanism for loading an existing data_labeler: + +.. code-block:: python + + import dataprofiler as dp + + data_labeler = dp.DataLabeler( + labeler_type='structured', dirpath="/path/to/my/labeler") + + # get information about the parameters/inputs/output formats for the DataLabeler + data_labeler.help() + +Extending a Data Labeler with Transfer Learning +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Extending or changing labels of a data labeler w/ transfer learning: +Note: By default, **a labeler loaded will not be trainable**. In order to load a +trainable DataLabeler, the user must set `trainable=True` or load a labeler +using the `TrainableDataLabeler` class. + +The following illustrates how to change the labels: + +.. code-block:: python + + import dataprofiler as dp + + labels = ['label1', 'label2', ...] # new label set can also be an encoding dict + data = dp.Data("your_file.csv") # contains data with new labels + + # load default structured Data Labeler w/ trainable set to True + data_labeler = dp.DataLabeler(labeler_type='structured', trainable=True) + + # this will use transfer learning to retrain the data labeler on your new + # dataset and labels. 
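+    # (labels may be given as the list defined above or as an encoding dict)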
+ # NOTE: data must be in an acceptable format for the preprocessor to interpret. + # please refer to the preprocessor/model for the expected data format. + # Currently, the DataLabeler cannot take in Tabular data, but requires + # data to be ingested with two columns [X, y] where X is the samples and + # y is the labels. + model_results = data_labeler.fit(x=data['samples'], y=data['labels'], + validation_split=0.2, epochs=2, labels=labels) + + # final_results, final_confidences are a list of results for each epoch + epoch_id = 0 + final_results = model_results[epoch_id]["pred"] + final_confidences = model_results[epoch_id]["conf"] + +The following illustrates how to extend the labels: + +.. code-block:: python + + import dataprofiler as dp + + new_labels = ['label1', 'label2', ...] + data = dp.Data("your_file.csv") # contains data with new labels + + # load default structured Data Labeler w/ trainable set to True + data_labeler = dp.DataLabeler(labeler_type='structured', trainable=True) + + # this will maintain current labels and model weights, but extend the model's + # labels + for label in new_labels: + data_labeler.add_label(label) + + # NOTE: a user can also add a label which maps to the same index as an existing + # label + # data_labeler.add_label(label, same_as='') + + # For a trainable model, the user must then train the model to be able to + # continue using the labeler since the model's graph has likely changed + # NOTE: data must be in an acceptable format for the preprocessor to interpret. + # please refer to the preprocessor/model for the expected data format. + # Currently, the DataLabeler cannot take in Tabular data, but requires + # data to be ingested with two columns [X, y] where X is the samples and + # y is the labels. + model_results = data_labeler.fit(x=data['samples'], y=data['labels'], + validation_split=0.2, epochs=2) + + # final_results, final_confidences are a list of results for each epoch + epoch_id = 0 + final_results = model_results[epoch_id]["pred"] + final_confidences = model_results[epoch_id]["conf"] + + +Changing pipeline parameters: + +.. code-block:: python + + import dataprofiler as dp + + # load default Data Labeler + data_labeler = dp.DataLabeler(labeler_type='structured') + + # change parameters of specific component + data_labeler.preprocessor.set_params({'param1': 'value1'}) + + # change multiple simultaneously. + data_labeler.set_params({ + 'preprocessor': {'param1': 'value1'}, + 'model': {'param2': 'value2'}, + 'postprocessor': {'param3': 'value3'} + }) + + +Build Your Own Data Labeler +=========================== + +The DataLabeler has 3 main components: preprocessor, model, and postprocessor. +To create your own DataLabeler, each one would have to be created or an +existing component can be reused. + +Given a set of the 3 components, you can construct your own DataLabeler: + +.. code-block:: python + from dataprofiler.labelers.base_data_labeler import BaseDataLabeler, \ + TrainableDataLabeler + from dataprofiler.labelers.character_level_cnn_model import CharacterLevelCnnModel + from dataprofiler.labelers.data_processing import \ + StructCharPreprocessor, StructCharPostprocessor + + # load a non-trainable data labeler + model = CharacterLevelCnnModel(...) + preprocessor = StructCharPreprocessor(...) + postprocessor = StructCharPostprocessor(...) 
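+    # (the '...' placeholders stand for each component's own constructor
+    #  parameters; replace them with the arguments those classes expect)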
+ + data_labeler = BaseDataLabeler.load_with_components( + preprocessor=preprocessor, model=model, postprocessor=postprocessor) + + # check for basic compatibility between the processors and the model + data_labeler.check_pipeline() + + + # load trainable data labeler + data_labeler = TrainableDataLabeler.load_with_components( + preprocessor=preprocessor, model=model, postprocessor=postprocessor) + + # check for basic compatibility between the processors and the model + data_labeler.check_pipeline() + +Option for swapping out specific components of an existing labeler. + +.. code-block:: python + + import dataprofiler as dp + from dataprofiler.labelers.character_level_cnn_model import \ + CharacterLevelCnnModel + from dataprofiler.labelers.data_processing import \ + StructCharPreprocessor, StructCharPostprocessor + + model = CharacterLevelCnnModel(...) + preprocessor = StructCharPreprocessor(...) + postprocessor = StructCharPostprocessor(...) + + data_labeler = dp.DataLabeler(labeler_type='structured') + data_labeler.set_preprocessor(preprocessor) + data_labeler.set_model(model) + data_labeler.set_postprocessor(postprocessor) + + # check for basic compatibility between the processors and the model + data_labeler.check_pipeline() + + +Model Component +~~~~~~~~~~~~~~~ + +In order to create your own model component for data labeling, you can utilize +the `BaseModel` class from `dataprofiler.labelers.base_model` and +overriding the abstract class methods. + +Reviewing `CharacterLevelCnnModel` from +`dataprofiler.labelers.character_level_cnn_model` illustrates the functions +which need an override. + +#. `__init__`: specifying default parameters and calling base `__init__` +#. `_validate_parameters`: validating parameters given by user during setting +#. `_need_to_reconstruct_model`: flag for when to reconstruct a model (i.e. + parameters change or labels change require a model reconstruction) +#. `_construct_model`: initial construction of the model given the parameters +#. `_reconstruct_model`: updates model architecture for new label set while + maintaining current model weights +#. `fit`: mechanism for the model to learn given training data +#. `predict`: mechanism for model to make predictions on data +#. `details`: prints a summary of the model construction +#. `save_to_disk`: saves model and model parameters to disk +#. `load_from_disk`: loads model given a path on disk + + +Preprocessor Component +~~~~~~~~~~~~~~~~~~~~~~ + +In order to create your own preprocessor component for data labeling, you can +utilize the `BaseDataPreprocessor` class +from `dataprofiler.labelers.data_processing` and override the abstract class +methods. + +Reviewing `StructCharPreprocessor` from +`dataprofiler.labelers.data_processing` illustrates the functions which +need an override. + +#. `__init__`: passing parameters to the base class and executing any + extraneous calculations to be saved as parameters +#. `_validate_parameters`: validating parameters given by user during + setting +#. `process`: takes in the user data and converts it into an digestible, + iterable format for the model +#. `set_params` (optional): if a parameter requires processing before setting, + a user can override this function to assist with setting the parameter +#. `_save_processor` (optional): if a parameter is not JSON serializable, a + user can override this function to assist in saving the processor and its + parameters +#. 
`load_from_disk` (optional): if a parameter(s) is not JSON serializable, a + user can override this function to assist in loading the processor + +Postprocessor Component +~~~~~~~~~~~~~~~~~~~~~~~ + +The postprocessor is nearly identical to the preprocessor except it handles +the output of the model for processing. In order to create your own +postprocessor component for data labeling, you can utilize the +`BaseDataPostprocessor` class from `dataprofiler.labelers.data_processing` +and override the abstract class methods. + +Reviewing `StructCharPostprocessor` from +`dataprofiler.labelers.data_processing` illustrates the functions which +need an override. + +#. `__init__`: passing parameters to the base class and executing any + extraneous calculations to be saved as parameters +#. `_validate_parameters`: validating parameters given by user during + setting +#. `process`: takes in the output of the model and processes for output to + the user +#. `set_params` (optional): if a parameter requires processing before setting, + a user can override this function to assist with setting the parameter +#. `_save_processor` (optional): if a parameter is not JSON serializable, a + user can override this function to assist in saving the processor and its + parameters +#. `load_from_disk` (optional): if a parameter(s) is not JSON serializable, a + user can override this function to assist in loading the processor diff --git a/docs/0.7.1/html/_sources/data_reader.nblink.txt b/docs/0.7.1/html/_sources/data_reader.nblink.txt new file mode 100644 index 000000000..4722970da --- /dev/null +++ b/docs/0.7.1/html/_sources/data_reader.nblink.txt @@ -0,0 +1,3 @@ +{ + "path": "../../feature_branch/examples/data_readers.ipynb" +} \ No newline at end of file diff --git a/docs/0.7.1/html/_sources/data_readers.rst.txt b/docs/0.7.1/html/_sources/data_readers.rst.txt new file mode 100644 index 000000000..45e815b63 --- /dev/null +++ b/docs/0.7.1/html/_sources/data_readers.rst.txt @@ -0,0 +1,139 @@ +.. _data_readers: + +Data Readers +************ + +The `Data` class itself will identify then output one of the following `Data` class types. +Using the data reader is easy, just pass it through the Data object. + +.. code-block:: python + + import dataprofiler as dp + data = dp.Data("your_file.csv") + +The supported file types are: + +* CSV file (or any delimited file) +* JSON object +* Avro file +* Parquet file +* Text file +* Pandas DataFrame +* A URL that points to one of the supported file types above + +It's also possible to specifically call one of the data classes such as the following command: + +.. code-block:: python + + from dataprofiler.data_readers.csv_data import CSVData + data = CSVData("your_file.csv", options={"delimiter": ","}) + +Additionally any of the data classes can be loaded using a URL: + +.. code-block:: python + + import dataprofiler as dp + data = dp.Data("https://you_website.com/your_file.file", options={"verify_ssl": "True"}) + +Below are descriptions of the various `Data` classes and the available options. + +CSVData +======= + +Data class for loading datasets of type CSV. Can be specified by passing +in memory data or via a file path. Options pertaining the CSV may also +be specified using the options dict parameter. 
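+
+For example, a minimal sketch combining several of the options documented
+below (`your_file.csv` and the option values are placeholders):
+
+.. code-block:: python
+
+    from dataprofiler.data_readers.csv_data import CSVData
+
+    # any combination of the options listed below may be passed in one dict
+    data = CSVData(
+        "your_file.csv",
+        options={
+            "delimiter": ",",
+            "header": 0,  # header row, 0-based index
+            "selected_columns": ["column 1", "ssn"],
+            "data_format": "dataframe",
+        })
+
+    # structured data is accessible as a pandas DataFrame via the data property
+    print(data.data.head())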
+ +`CSVData(input_file_path=None, data=None, options=None)` + +Possible `options`: + +* delimiter - Must be a string, for example `"delimiter": ","` +* data_format - must be a string, possible choices: "dataframe", "records" +* selected_columns - columns being selected from the entire dataset, must be a + list `["column 1", "ssn"]` +* header - Define the header, for example + + * `"header": 'auto'` for auto detection + * `"header": None` for no header + * `"header": ` to specify the header row (0 based index) + +JSONData +======== + +Data class for loading datasets of type JSON. Can be specified by +passing in memory data or via a file path. Options pertaining the JSON +may also be specified using the options dict parameter. JSON data can be +accessed via the "data" property, the "metadata" property, and the +"data_and_metadata" property. + +`JSONData(input_file_path=None, data=None, options=None)` + +Possible `options`: + +* data_format - must be a string, choices: "dataframe", "records", "json", "flattened_dataframe" + + * "flattened_dataframe" is best used for JSON structure typically found in data streams that contain + nested lists of dictionaries and a payload. For example: `{"data": [ columns ], "response": 200}` +* selected_keys - columns being selected from the entire dataset, must be a list `["column 1", "ssn"]` +* payload_keys - The dictionary keys for the payload of the JSON, typically called "data" + or "payload". Defaults to ["data", "payload", "response"]. + + +AVROData +======== + +Data class for loading datasets of type AVRO. Can be specified by +passing in memory data or via a file path. Options pertaining the AVRO +may also be specified using the options dict parameter. + +`AVROData(input_file_path=None, data=None, options=None)` + +Possible `options`: + +* data_format - must be a string, choices: "dataframe", "records", "avro", "json", "flattened_dataframe" + + * "flattened_dataframe" is best used for AVROs with a JSON structure typically found in data streams that contain + nested lists of dictionaries and a payload. For example: `{"data": [ columns ], "response": 200}` +* selected_keys - columns being selected from the entire dataset, must be a list `["column 1", "ssn"]` + +ParquetData +=========== + +Data class for loading datasets of type PARQUET. Can be specified by +passing in memory data or via a file path. Options pertaining the +PARQUET may also be specified using the options dict parameter. + +`ParquetData(input_file_path=None, data=None, options=None)` + +Possible `options`: + +* data_format - must be a string, choices: "dataframe", "records", "json" +* selected_keys - columns being selected from the entire dataset, must be a list `["column 1", "ssn"]` + +TextData +======== + +Data class for loading datasets of type TEXT. Can be specified by +passing in memory data or via a file path. Options pertaining the TEXT +may also be specified using the options dict parameter. + +`TextData(input_file_path=None, data=None, options=None)` + +Possible `options`: + +* data_format: user selected format in which to return data. Currently only supports "text". +* samples_per_line - chunks by which to read in the specified dataset + +Data Using a URL +================ + +Data class for loading datasets of any type using a URL. Specified by passing in +any valid URL that points to one of the valid data types. Options pertaining the +URL may also be specified using the options dict parameter. 
+ +`Data(input_file_path=None, data=None, options=None)` + +Possible `options`: + +* verify_ssl: must be a boolean string, choices: "True", "False". Set to "True" by default. \ No newline at end of file diff --git a/docs/0.7.1/html/_sources/dataprofiler.data_readers.avro_data.rst.txt b/docs/0.7.1/html/_sources/dataprofiler.data_readers.avro_data.rst.txt new file mode 100644 index 000000000..d3227df29 --- /dev/null +++ b/docs/0.7.1/html/_sources/dataprofiler.data_readers.avro_data.rst.txt @@ -0,0 +1,7 @@ +Avro Data +========= + +.. automodule:: dataprofiler.data_readers.avro_data + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.7.1/html/_sources/dataprofiler.data_readers.base_data.rst.txt b/docs/0.7.1/html/_sources/dataprofiler.data_readers.base_data.rst.txt new file mode 100644 index 000000000..b82883cb9 --- /dev/null +++ b/docs/0.7.1/html/_sources/dataprofiler.data_readers.base_data.rst.txt @@ -0,0 +1,7 @@ +Base Data +========= + +.. automodule:: dataprofiler.data_readers.base_data + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.7.1/html/_sources/dataprofiler.data_readers.csv_data.rst.txt b/docs/0.7.1/html/_sources/dataprofiler.data_readers.csv_data.rst.txt new file mode 100644 index 000000000..85a625d69 --- /dev/null +++ b/docs/0.7.1/html/_sources/dataprofiler.data_readers.csv_data.rst.txt @@ -0,0 +1,7 @@ +CSV Data +======== + +.. automodule:: dataprofiler.data_readers.csv_data + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.7.1/html/_sources/dataprofiler.data_readers.data.rst.txt b/docs/0.7.1/html/_sources/dataprofiler.data_readers.data.rst.txt new file mode 100644 index 000000000..813e81805 --- /dev/null +++ b/docs/0.7.1/html/_sources/dataprofiler.data_readers.data.rst.txt @@ -0,0 +1,7 @@ +Data +==== + +.. automodule:: dataprofiler.data_readers.data + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.7.1/html/_sources/dataprofiler.data_readers.data_utils.rst.txt b/docs/0.7.1/html/_sources/dataprofiler.data_readers.data_utils.rst.txt new file mode 100644 index 000000000..309208b73 --- /dev/null +++ b/docs/0.7.1/html/_sources/dataprofiler.data_readers.data_utils.rst.txt @@ -0,0 +1,7 @@ +Data Utils +========== + +.. automodule:: dataprofiler.data_readers.data_utils + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.7.1/html/_sources/dataprofiler.data_readers.filepath_or_buffer.rst.txt b/docs/0.7.1/html/_sources/dataprofiler.data_readers.filepath_or_buffer.rst.txt new file mode 100644 index 000000000..89e78cc2d --- /dev/null +++ b/docs/0.7.1/html/_sources/dataprofiler.data_readers.filepath_or_buffer.rst.txt @@ -0,0 +1,7 @@ +Filepath Or Buffer +================== + +.. automodule:: dataprofiler.data_readers.filepath_or_buffer + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.7.1/html/_sources/dataprofiler.data_readers.json_data.rst.txt b/docs/0.7.1/html/_sources/dataprofiler.data_readers.json_data.rst.txt new file mode 100644 index 000000000..ae0a51d13 --- /dev/null +++ b/docs/0.7.1/html/_sources/dataprofiler.data_readers.json_data.rst.txt @@ -0,0 +1,7 @@ +JSON Data +========= + +.. 
automodule:: dataprofiler.data_readers.json_data + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.7.1/html/_sources/dataprofiler.data_readers.parquet_data.rst.txt b/docs/0.7.1/html/_sources/dataprofiler.data_readers.parquet_data.rst.txt new file mode 100644 index 000000000..dfdcbe4bb --- /dev/null +++ b/docs/0.7.1/html/_sources/dataprofiler.data_readers.parquet_data.rst.txt @@ -0,0 +1,7 @@ +Parquet Data +============ + +.. automodule:: dataprofiler.data_readers.parquet_data + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.7.1/html/_sources/dataprofiler.data_readers.rst.txt b/docs/0.7.1/html/_sources/dataprofiler.data_readers.rst.txt new file mode 100644 index 000000000..80324c993 --- /dev/null +++ b/docs/0.7.1/html/_sources/dataprofiler.data_readers.rst.txt @@ -0,0 +1,29 @@ +Data Readers +============ + + +Modules +------- + +.. toctree:: + :maxdepth: 4 + + +.. toctree:: + :maxdepth: 4 + + dataprofiler.data_readers.avro_data + dataprofiler.data_readers.base_data + dataprofiler.data_readers.csv_data + dataprofiler.data_readers.data + dataprofiler.data_readers.data_utils + dataprofiler.data_readers.filepath_or_buffer + dataprofiler.data_readers.json_data + dataprofiler.data_readers.parquet_data + dataprofiler.data_readers.structured_mixins + dataprofiler.data_readers.text_data + +.. automodule:: dataprofiler.data_readers + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/0.7.1/html/_sources/dataprofiler.data_readers.structured_mixins.rst.txt b/docs/0.7.1/html/_sources/dataprofiler.data_readers.structured_mixins.rst.txt new file mode 100644 index 000000000..157e03d2c --- /dev/null +++ b/docs/0.7.1/html/_sources/dataprofiler.data_readers.structured_mixins.rst.txt @@ -0,0 +1,7 @@ +Structured Mixins +================= + +.. automodule:: dataprofiler.data_readers.structured_mixins + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.7.1/html/_sources/dataprofiler.data_readers.text_data.rst.txt b/docs/0.7.1/html/_sources/dataprofiler.data_readers.text_data.rst.txt new file mode 100644 index 000000000..6ac6b9648 --- /dev/null +++ b/docs/0.7.1/html/_sources/dataprofiler.data_readers.text_data.rst.txt @@ -0,0 +1,7 @@ +Text Data +========= + +.. automodule:: dataprofiler.data_readers.text_data + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.7.1/html/_sources/dataprofiler.dp_logging.rst.txt b/docs/0.7.1/html/_sources/dataprofiler.dp_logging.rst.txt new file mode 100644 index 000000000..d1c6c910d --- /dev/null +++ b/docs/0.7.1/html/_sources/dataprofiler.dp_logging.rst.txt @@ -0,0 +1,7 @@ +Dp Logging +========== + +.. automodule:: dataprofiler.dp_logging + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.7.1/html/_sources/dataprofiler.labelers.base_data_labeler.rst.txt b/docs/0.7.1/html/_sources/dataprofiler.labelers.base_data_labeler.rst.txt new file mode 100644 index 000000000..839a74157 --- /dev/null +++ b/docs/0.7.1/html/_sources/dataprofiler.labelers.base_data_labeler.rst.txt @@ -0,0 +1,7 @@ +Base Data Labeler +================= + +.. 
automodule:: dataprofiler.labelers.base_data_labeler + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.7.1/html/_sources/dataprofiler.labelers.base_model.rst.txt b/docs/0.7.1/html/_sources/dataprofiler.labelers.base_model.rst.txt new file mode 100644 index 000000000..c4bc9b08b --- /dev/null +++ b/docs/0.7.1/html/_sources/dataprofiler.labelers.base_model.rst.txt @@ -0,0 +1,7 @@ +Base Model +========== + +.. automodule:: dataprofiler.labelers.base_model + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.7.1/html/_sources/dataprofiler.labelers.character_level_cnn_model.rst.txt b/docs/0.7.1/html/_sources/dataprofiler.labelers.character_level_cnn_model.rst.txt new file mode 100644 index 000000000..80113a935 --- /dev/null +++ b/docs/0.7.1/html/_sources/dataprofiler.labelers.character_level_cnn_model.rst.txt @@ -0,0 +1,7 @@ +Character Level Cnn Model +========================= + +.. automodule:: dataprofiler.labelers.character_level_cnn_model + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.7.1/html/_sources/dataprofiler.labelers.classification_report_utils.rst.txt b/docs/0.7.1/html/_sources/dataprofiler.labelers.classification_report_utils.rst.txt new file mode 100644 index 000000000..4c3624869 --- /dev/null +++ b/docs/0.7.1/html/_sources/dataprofiler.labelers.classification_report_utils.rst.txt @@ -0,0 +1,7 @@ +Classification Report Utils +=========================== + +.. automodule:: dataprofiler.labelers.classification_report_utils + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.7.1/html/_sources/dataprofiler.labelers.data_labelers.rst.txt b/docs/0.7.1/html/_sources/dataprofiler.labelers.data_labelers.rst.txt new file mode 100644 index 000000000..6ac45d9e6 --- /dev/null +++ b/docs/0.7.1/html/_sources/dataprofiler.labelers.data_labelers.rst.txt @@ -0,0 +1,7 @@ +Data Labelers +============= + +.. automodule:: dataprofiler.labelers.data_labelers + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.7.1/html/_sources/dataprofiler.labelers.data_processing.rst.txt b/docs/0.7.1/html/_sources/dataprofiler.labelers.data_processing.rst.txt new file mode 100644 index 000000000..58d572a29 --- /dev/null +++ b/docs/0.7.1/html/_sources/dataprofiler.labelers.data_processing.rst.txt @@ -0,0 +1,7 @@ +Data Processing +=============== + +.. automodule:: dataprofiler.labelers.data_processing + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.7.1/html/_sources/dataprofiler.labelers.labeler_utils.rst.txt b/docs/0.7.1/html/_sources/dataprofiler.labelers.labeler_utils.rst.txt new file mode 100644 index 000000000..f14cabd5c --- /dev/null +++ b/docs/0.7.1/html/_sources/dataprofiler.labelers.labeler_utils.rst.txt @@ -0,0 +1,7 @@ +Labeler Utils +============= + +.. automodule:: dataprofiler.labelers.labeler_utils + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.7.1/html/_sources/dataprofiler.labelers.regex_model.rst.txt b/docs/0.7.1/html/_sources/dataprofiler.labelers.regex_model.rst.txt new file mode 100644 index 000000000..e85772ad2 --- /dev/null +++ b/docs/0.7.1/html/_sources/dataprofiler.labelers.regex_model.rst.txt @@ -0,0 +1,7 @@ +Regex Model +=========== + +.. 
automodule:: dataprofiler.labelers.regex_model + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.7.1/html/_sources/dataprofiler.labelers.rst.txt b/docs/0.7.1/html/_sources/dataprofiler.labelers.rst.txt new file mode 100644 index 000000000..86eab391e --- /dev/null +++ b/docs/0.7.1/html/_sources/dataprofiler.labelers.rst.txt @@ -0,0 +1,27 @@ +Labelers +======== + + +Modules +------- + +.. toctree:: + :maxdepth: 4 + + +.. toctree:: + :maxdepth: 4 + + dataprofiler.labelers.base_data_labeler + dataprofiler.labelers.base_model + dataprofiler.labelers.character_level_cnn_model + dataprofiler.labelers.classification_report_utils + dataprofiler.labelers.data_labelers + dataprofiler.labelers.data_processing + dataprofiler.labelers.labeler_utils + dataprofiler.labelers.regex_model + +.. automodule:: dataprofiler.labelers + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/0.7.1/html/_sources/dataprofiler.profilers.base_column_profilers.rst.txt b/docs/0.7.1/html/_sources/dataprofiler.profilers.base_column_profilers.rst.txt new file mode 100644 index 000000000..13cab9ff4 --- /dev/null +++ b/docs/0.7.1/html/_sources/dataprofiler.profilers.base_column_profilers.rst.txt @@ -0,0 +1,7 @@ +Base Column Profilers +===================== + +.. automodule:: dataprofiler.profilers.base_column_profilers + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.7.1/html/_sources/dataprofiler.profilers.categorical_column_profile.rst.txt b/docs/0.7.1/html/_sources/dataprofiler.profilers.categorical_column_profile.rst.txt new file mode 100644 index 000000000..a525d86c8 --- /dev/null +++ b/docs/0.7.1/html/_sources/dataprofiler.profilers.categorical_column_profile.rst.txt @@ -0,0 +1,7 @@ +Categorical Column Profile +========================== + +.. automodule:: dataprofiler.profilers.categorical_column_profile + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.7.1/html/_sources/dataprofiler.profilers.column_profile_compilers.rst.txt b/docs/0.7.1/html/_sources/dataprofiler.profilers.column_profile_compilers.rst.txt new file mode 100644 index 000000000..9599deb9a --- /dev/null +++ b/docs/0.7.1/html/_sources/dataprofiler.profilers.column_profile_compilers.rst.txt @@ -0,0 +1,7 @@ +Column Profile Compilers +======================== + +.. automodule:: dataprofiler.profilers.column_profile_compilers + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.7.1/html/_sources/dataprofiler.profilers.data_labeler_column_profile.rst.txt b/docs/0.7.1/html/_sources/dataprofiler.profilers.data_labeler_column_profile.rst.txt new file mode 100644 index 000000000..282408931 --- /dev/null +++ b/docs/0.7.1/html/_sources/dataprofiler.profilers.data_labeler_column_profile.rst.txt @@ -0,0 +1,7 @@ +Data Labeler Column Profile +=========================== + +.. automodule:: dataprofiler.profilers.data_labeler_column_profile + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.7.1/html/_sources/dataprofiler.profilers.datetime_column_profile.rst.txt b/docs/0.7.1/html/_sources/dataprofiler.profilers.datetime_column_profile.rst.txt new file mode 100644 index 000000000..d4467634f --- /dev/null +++ b/docs/0.7.1/html/_sources/dataprofiler.profilers.datetime_column_profile.rst.txt @@ -0,0 +1,7 @@ +Datetime Column Profile +======================= + +.. 
automodule:: dataprofiler.profilers.datetime_column_profile + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.7.1/html/_sources/dataprofiler.profilers.float_column_profile.rst.txt b/docs/0.7.1/html/_sources/dataprofiler.profilers.float_column_profile.rst.txt new file mode 100644 index 000000000..d23bb4336 --- /dev/null +++ b/docs/0.7.1/html/_sources/dataprofiler.profilers.float_column_profile.rst.txt @@ -0,0 +1,7 @@ +Float Column Profile +==================== + +.. automodule:: dataprofiler.profilers.float_column_profile + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.7.1/html/_sources/dataprofiler.profilers.helpers.report_helpers.rst.txt b/docs/0.7.1/html/_sources/dataprofiler.profilers.helpers.report_helpers.rst.txt new file mode 100644 index 000000000..c20d1f391 --- /dev/null +++ b/docs/0.7.1/html/_sources/dataprofiler.profilers.helpers.report_helpers.rst.txt @@ -0,0 +1,7 @@ +Report Helpers +============== + +.. automodule:: dataprofiler.profilers.helpers.report_helpers + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.7.1/html/_sources/dataprofiler.profilers.helpers.rst.txt b/docs/0.7.1/html/_sources/dataprofiler.profilers.helpers.rst.txt new file mode 100644 index 000000000..b82f660b5 --- /dev/null +++ b/docs/0.7.1/html/_sources/dataprofiler.profilers.helpers.rst.txt @@ -0,0 +1,20 @@ +Helpers +======= + + +Modules +------- + +.. toctree:: + :maxdepth: 4 + + +.. toctree:: + :maxdepth: 4 + + dataprofiler.profilers.helpers.report_helpers + +.. automodule:: dataprofiler.profilers.helpers + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/0.7.1/html/_sources/dataprofiler.profilers.histogram_utils.rst.txt b/docs/0.7.1/html/_sources/dataprofiler.profilers.histogram_utils.rst.txt new file mode 100644 index 000000000..039bb08d8 --- /dev/null +++ b/docs/0.7.1/html/_sources/dataprofiler.profilers.histogram_utils.rst.txt @@ -0,0 +1,7 @@ +Histogram Utils +=============== + +.. automodule:: dataprofiler.profilers.histogram_utils + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.7.1/html/_sources/dataprofiler.profilers.int_column_profile.rst.txt b/docs/0.7.1/html/_sources/dataprofiler.profilers.int_column_profile.rst.txt new file mode 100644 index 000000000..b6d8d7921 --- /dev/null +++ b/docs/0.7.1/html/_sources/dataprofiler.profilers.int_column_profile.rst.txt @@ -0,0 +1,7 @@ +Int Column Profile +================== + +.. automodule:: dataprofiler.profilers.int_column_profile + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.7.1/html/_sources/dataprofiler.profilers.numerical_column_stats.rst.txt b/docs/0.7.1/html/_sources/dataprofiler.profilers.numerical_column_stats.rst.txt new file mode 100644 index 000000000..a8b6ac226 --- /dev/null +++ b/docs/0.7.1/html/_sources/dataprofiler.profilers.numerical_column_stats.rst.txt @@ -0,0 +1,7 @@ +Numerical Column Stats +====================== + +.. 
automodule:: dataprofiler.profilers.numerical_column_stats + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.7.1/html/_sources/dataprofiler.profilers.order_column_profile.rst.txt b/docs/0.7.1/html/_sources/dataprofiler.profilers.order_column_profile.rst.txt new file mode 100644 index 000000000..7b1605659 --- /dev/null +++ b/docs/0.7.1/html/_sources/dataprofiler.profilers.order_column_profile.rst.txt @@ -0,0 +1,7 @@ +Order Column Profile +==================== + +.. automodule:: dataprofiler.profilers.order_column_profile + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.7.1/html/_sources/dataprofiler.profilers.profile_builder.rst.txt b/docs/0.7.1/html/_sources/dataprofiler.profilers.profile_builder.rst.txt new file mode 100644 index 000000000..d73e6d9a2 --- /dev/null +++ b/docs/0.7.1/html/_sources/dataprofiler.profilers.profile_builder.rst.txt @@ -0,0 +1,7 @@ +Profile Builder +=============== + +.. automodule:: dataprofiler.profilers.profile_builder + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.7.1/html/_sources/dataprofiler.profilers.profiler_options.rst.txt b/docs/0.7.1/html/_sources/dataprofiler.profilers.profiler_options.rst.txt new file mode 100644 index 000000000..127e8e1ad --- /dev/null +++ b/docs/0.7.1/html/_sources/dataprofiler.profilers.profiler_options.rst.txt @@ -0,0 +1,7 @@ +Profiler Options +================ + +.. automodule:: dataprofiler.profilers.profiler_options + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.7.1/html/_sources/dataprofiler.profilers.rst.txt b/docs/0.7.1/html/_sources/dataprofiler.profilers.rst.txt new file mode 100644 index 000000000..476df4cec --- /dev/null +++ b/docs/0.7.1/html/_sources/dataprofiler.profilers.rst.txt @@ -0,0 +1,36 @@ +Profilers +========= + + +Modules +------- + +.. toctree:: + :maxdepth: 4 + + dataprofiler.profilers.helpers + +.. toctree:: + :maxdepth: 4 + + dataprofiler.profilers.base_column_profilers + dataprofiler.profilers.categorical_column_profile + dataprofiler.profilers.column_profile_compilers + dataprofiler.profilers.data_labeler_column_profile + dataprofiler.profilers.datetime_column_profile + dataprofiler.profilers.float_column_profile + dataprofiler.profilers.histogram_utils + dataprofiler.profilers.int_column_profile + dataprofiler.profilers.numerical_column_stats + dataprofiler.profilers.order_column_profile + dataprofiler.profilers.profile_builder + dataprofiler.profilers.profiler_options + dataprofiler.profilers.text_column_profile + dataprofiler.profilers.unstructured_labeler_profile + dataprofiler.profilers.unstructured_text_profile + dataprofiler.profilers.utils + +.. automodule:: dataprofiler.profilers + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/0.7.1/html/_sources/dataprofiler.profilers.text_column_profile.rst.txt b/docs/0.7.1/html/_sources/dataprofiler.profilers.text_column_profile.rst.txt new file mode 100644 index 000000000..097e6e02c --- /dev/null +++ b/docs/0.7.1/html/_sources/dataprofiler.profilers.text_column_profile.rst.txt @@ -0,0 +1,7 @@ +Text Column Profile +=================== + +.. 
automodule:: dataprofiler.profilers.text_column_profile + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.7.1/html/_sources/dataprofiler.profilers.unstructured_data_labeler_column_profile.rst.txt b/docs/0.7.1/html/_sources/dataprofiler.profilers.unstructured_data_labeler_column_profile.rst.txt new file mode 100644 index 000000000..412a5224e --- /dev/null +++ b/docs/0.7.1/html/_sources/dataprofiler.profilers.unstructured_data_labeler_column_profile.rst.txt @@ -0,0 +1,7 @@ +Unstructured Data Labeler Column Profile +======================================== + +.. automodule:: dataprofiler.profilers.unstructured_data_labeler_column_profile + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.7.1/html/_sources/dataprofiler.profilers.unstructured_labeler_profile.rst.txt b/docs/0.7.1/html/_sources/dataprofiler.profilers.unstructured_labeler_profile.rst.txt new file mode 100644 index 000000000..c49f68004 --- /dev/null +++ b/docs/0.7.1/html/_sources/dataprofiler.profilers.unstructured_labeler_profile.rst.txt @@ -0,0 +1,7 @@ +Unstructured Labeler Profile +============================ + +.. automodule:: dataprofiler.profilers.unstructured_labeler_profile + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.7.1/html/_sources/dataprofiler.profilers.unstructured_text_profile.rst.txt b/docs/0.7.1/html/_sources/dataprofiler.profilers.unstructured_text_profile.rst.txt new file mode 100644 index 000000000..27b56ea93 --- /dev/null +++ b/docs/0.7.1/html/_sources/dataprofiler.profilers.unstructured_text_profile.rst.txt @@ -0,0 +1,7 @@ +Unstructured Text Profile +========================= + +.. automodule:: dataprofiler.profilers.unstructured_text_profile + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.7.1/html/_sources/dataprofiler.profilers.utils.rst.txt b/docs/0.7.1/html/_sources/dataprofiler.profilers.utils.rst.txt new file mode 100644 index 000000000..ddae85e63 --- /dev/null +++ b/docs/0.7.1/html/_sources/dataprofiler.profilers.utils.rst.txt @@ -0,0 +1,7 @@ +Utils +===== + +.. automodule:: dataprofiler.profilers.utils + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.7.1/html/_sources/dataprofiler.reports.graphs.rst.txt b/docs/0.7.1/html/_sources/dataprofiler.reports.graphs.rst.txt new file mode 100644 index 000000000..3a7adf900 --- /dev/null +++ b/docs/0.7.1/html/_sources/dataprofiler.reports.graphs.rst.txt @@ -0,0 +1,7 @@ +Graphs +====== + +.. automodule:: dataprofiler.reports.graphs + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.7.1/html/_sources/dataprofiler.reports.rst.txt b/docs/0.7.1/html/_sources/dataprofiler.reports.rst.txt new file mode 100644 index 000000000..2b4c679c5 --- /dev/null +++ b/docs/0.7.1/html/_sources/dataprofiler.reports.rst.txt @@ -0,0 +1,20 @@ +Reports +======= + + +Modules +------- + +.. toctree:: + :maxdepth: 4 + + +.. toctree:: + :maxdepth: 4 + + dataprofiler.reports.graphs + +.. automodule:: dataprofiler.reports + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/0.7.1/html/_sources/dataprofiler.rst.txt b/docs/0.7.1/html/_sources/dataprofiler.rst.txt new file mode 100644 index 000000000..7a57e0427 --- /dev/null +++ b/docs/0.7.1/html/_sources/dataprofiler.rst.txt @@ -0,0 +1,27 @@ +Dataprofiler +============ + + +Modules +------- + +.. 
toctree:: + :maxdepth: 4 + + dataprofiler.data_readers + dataprofiler.labelers + dataprofiler.profilers + dataprofiler.reports + dataprofiler.validators + +.. toctree:: + :maxdepth: 4 + + dataprofiler.dp_logging + dataprofiler.settings + dataprofiler.version + +.. automodule:: dataprofiler + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/0.7.1/html/_sources/dataprofiler.settings.rst.txt b/docs/0.7.1/html/_sources/dataprofiler.settings.rst.txt new file mode 100644 index 000000000..81c664c07 --- /dev/null +++ b/docs/0.7.1/html/_sources/dataprofiler.settings.rst.txt @@ -0,0 +1,7 @@ +Settings +======== + +.. automodule:: dataprofiler.settings + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.7.1/html/_sources/dataprofiler.validators.base_validators.rst.txt b/docs/0.7.1/html/_sources/dataprofiler.validators.base_validators.rst.txt new file mode 100644 index 000000000..b8f328736 --- /dev/null +++ b/docs/0.7.1/html/_sources/dataprofiler.validators.base_validators.rst.txt @@ -0,0 +1,7 @@ +Base Validators +=============== + +.. automodule:: dataprofiler.validators.base_validators + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.7.1/html/_sources/dataprofiler.validators.rst.txt b/docs/0.7.1/html/_sources/dataprofiler.validators.rst.txt new file mode 100644 index 000000000..704dfee21 --- /dev/null +++ b/docs/0.7.1/html/_sources/dataprofiler.validators.rst.txt @@ -0,0 +1,20 @@ +Validators +========== + + +Modules +------- + +.. toctree:: + :maxdepth: 4 + + +.. toctree:: + :maxdepth: 4 + + dataprofiler.validators.base_validators + +.. automodule:: dataprofiler.validators + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/0.7.1/html/_sources/dataprofiler.version.rst.txt b/docs/0.7.1/html/_sources/dataprofiler.version.rst.txt new file mode 100644 index 000000000..3977b6379 --- /dev/null +++ b/docs/0.7.1/html/_sources/dataprofiler.version.rst.txt @@ -0,0 +1,7 @@ +Version +======= + +.. automodule:: dataprofiler.version + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/0.7.1/html/_sources/examples.rst.txt b/docs/0.7.1/html/_sources/examples.rst.txt new file mode 100644 index 000000000..458677b5f --- /dev/null +++ b/docs/0.7.1/html/_sources/examples.rst.txt @@ -0,0 +1,19 @@ +.. _examples: + +Examples +******** + +These examples provide a more in-depth look into the details of the ``Data Profiler`` library. + +Basics +------ + +.. toctree:: + :maxdepth: 0 + + Overview of Data Profiler + Data Reader + Structured Profiler + Unstructured Profiler + Labeler + Adding Models to a Labeler Pipeline diff --git a/docs/0.7.1/html/_sources/graphs.rst.txt b/docs/0.7.1/html/_sources/graphs.rst.txt new file mode 100644 index 000000000..08d7a8063 --- /dev/null +++ b/docs/0.7.1/html/_sources/graphs.rst.txt @@ -0,0 +1,112 @@ +.. _reports: + +Graphs +****** + +Graph Your Data +=============== + +We can plot some of our data as seaborn histogram plots. Below will demonstrate how to do so and provide examples. + +The following plots are currently available to work directly with your profilers: + + * histogram (numeric columns only) + +Below shows how to do so with examples. + +What we need to import +~~~~~~~~~~~~~~~~~~~~~~ +.. code-block:: python + + from dataprofiler.reports import graphs + +The main functions that is used to plot histograms are in graphs. **You will also need the `dataprofiler[reports]` requirement to be installed**: + +.. 
code-block:: console + + pip install 'dataprofiler[reports]' + +Plotting from a StructuredProfiler class +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +With a StructuredProfiler class variable, we can specify what columns we want to be plotted, and plot them into histograms. + +.. code-block:: python + + graphs.plot_histograms(profiler, columns) + +These are what the variables mean: + + * **profiler** - StructuredProfiler class variable that contains the data we want + * **columns** - (Optional) The list of IntColumn or FloatColumn we want to specifically plot. + +Plotting an individual IntColumn or FloatColumn +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Example uses a CSV file for example, but CSV, JSON, Avro or Parquet should also work. + +.. code-block:: python + + graphs.plot_col_histogram(column, axes, title) + +These are what the variables mean: + + * **column** - The IntColumn or FloatColumn we want to plot + * **axes** - (Optional) The axes we want to specify. + * **title** - (Optional) The title of the plot we want to define. + +Examples +~~~~~~~~ + +1. This example demonstrates how we can take a StructuredProfiler class and plot histograms of the specified columns. + +.. code-block:: python + + import dataprofiler as dp + from dataprofiler.reports import graphs + + + data = [[1, 'a', 1.0], + [2, 'b', 2.2], + [3, 'c', 3.5], + [None, 'd', 10.0]] + profiler = dp.StructuredProfiler(data) + + # This will plot all IntColumn and FloatColumn as histograms (The first and last column). + fig = graphs.plot_histograms(profiler) + fig.show() + + # This will only plot the specified column, 0. + columns = [0] + fig = graphs.plot_histograms(profiler, columns) + fig.show() + +.. image:: _static/images/histogram_example_0.png + :alt: First Histogram Example Image + +.. image:: _static/images/histogram_example_1.png + :alt: Second Histogram Example Image + +2. This example demonstrates how we can plot a low level profiler. + +.. code-block:: python + + import pandas as pd + + from dataprofiler.profilers import IntColumn + from dataprofiler.reports import graphs + + + data = pd.Series([1, 2, 3], dtype=str) + profiler = IntColumn('example') + profiler.update(data) + + # Plot the axes + ax = graphs.plot_col_histogram(profiler) + + # get and show the figure of the plotted histogram + fig = ax.get_figure() + fig.show() + +.. image:: _static/images/histogram_example_2.png + :alt: Histogram Column Only Example Image diff --git a/docs/0.7.1/html/_sources/index.rst.txt b/docs/0.7.1/html/_sources/index.rst.txt new file mode 100644 index 000000000..aaefa381b --- /dev/null +++ b/docs/0.7.1/html/_sources/index.rst.txt @@ -0,0 +1,460 @@ +.. _Data Profiler: + +==================================== +Data Profiler | What's in your data? +==================================== + +Purpose +======= + +The DataProfiler is a Python library designed to make data analysis, monitoring and **sensitive data detection** easy. + +Loading **Data** with a single command, the library automatically formats & loads files into a DataFrame. **Profiling** the Data, the library identifies the schema, statistics, entities and more. Data Profiles can then be used in downstream applications or reports. + +The Data Profiler comes with a cutting edge pre-trained deep learning model, used to efficiently identify **sensitive data** (or **PII**). If customization is needed, it's easy to add new entities to the existing pre-trained model or insert a new pipeline for entity recognition. + +The best part? 
Getting started only takes a few lines of code (`Example CSV`_): + +.. code-block:: python + + import json + from dataprofiler import Data, Profiler + + data = Data("your_file.csv") # Auto-Detect & Load: CSV, AVRO, Parquet, JSON, Text + print(data.data.head(5)) # Access data directly via a compatible Pandas DataFrame + + profile = Profiler(data) # Calculate Statistics, Entity Recognition, etc + readable_report = profile.report(report_options={"output_format":"pretty"}) + print(json.dumps(readable_report, indent=4)) + + +To install the full package from pypi: + +.. code-block:: console + + pip install DataProfiler[ml] + +If the ML requirements are too strict (say, you don't want to install tensorflow), you can install a slimmer package. The slimmer package disables the default sensitive data detection / entity recognition (labler) + +Install from pypi: + +.. code-block:: console + + pip install DataProfiler + +If you have suggestions or find a bug, please open an `issue`_. + +Visit the :ref:`API` to explore Data Profiler's terminology. + + +What is a Data Profile? +======================= + +In the case of this library, a data profile is a dictionary containing statistics and predictions about the underlying dataset. There are "global statistics" or `global_stats`, which contain dataset level data and there are "column/row level statistics" or `data_stats` (each column is a new key-value entry). + +The format for a structured profile is below: + +.. code-block:: python + + "global_stats": { + "samples_used": int, + "column_count": int, + "row_count": int, + "row_has_null_ratio": float, + "row_is_null_ratio": float, + "unique_row_ratio": float, + "duplicate_row_count": int, + "file_type": string, + "encoding": string, + "correlation_matrix": list(list(int)), (*) + "profile_schema": dict[string, list(int)] + }, + "data_stats": [ + { + "column_name": string, + "data_type": string, + "data_label": string, + "categorical": bool, + "order": string, + "samples": list(str), + "statistics": { + "sample_size": int, + "null_count": int, + "null_types": list(string), + "null_types_index": dict[string, list(int)], + "data_type_representation": dict[string, list(string)], + "min": [null, float], + "max": [null, float], + "sum": float, + "mean": float, + "variance": float, + "stddev": float, + "skewness": float, + "kurtosis": float, + "num_zeros": int, + "num_negatives": int, + "histogram": { + "bin_counts": list(int), + "bin_edges": list(float), + }, + "quantiles": { + int: float + }, + "vocab": list(char), + "avg_predictions": dict[string, float], + "data_label_representation": dict[string, float], + "categories": list(str), + "unique_count": int, + "unique_ratio": float, + "categorical_count": dict[string, int], + "gini_impurity": float, + "unalikeability": float, + "precision": { + 'min': int, + 'max': int, + 'mean': float, + 'var': float, + 'std': float, + 'sample_size': int, + 'margin_of_error': float, + 'confidence_level': float + }, + "times": dict[string, float], + "format": string + } + } + ] + +(*) Currently the correlation matrix update is toggled off. It will be reset in a later update. Users can still use it as desired with the is_enable option set to True. + +The format for an unstructured profile is below: + +.. 
code-block:: python + + "global_stats": { + "samples_used": int, + "empty_line_count": int, + "file_type": string, + "encoding": string, + "memory_size": float, # in MB + }, + "data_stats": { + "data_label": { + "entity_counts": { + "word_level": dict[string, int], + "true_char_level": dict[string, int], + "postprocess_char_level": dict[string, int] + }, + "entity_percentages": { + "word_level": dict[string, float], + "true_char_level": dict[string, float], + "postprocess_char_level": dict[string, float] + }, + "times": dict[string, float] + }, + "statistics": { + "vocab": list(char), + "vocab_count": dict[string, int], + "words": list(string), + "word_count": dict[string, int], + "times": dict[string, float] + } + } + + +Supported Data Formats +~~~~~~~~~~~~~~~~~~~~~~ + +* Any delimited file (CSV, TSV, etc.) +* JSON object +* Avro file +* Parquet file +* Text file +* Pandas DataFrame +* A URL that points to one of the supported file types above + + +Data Labels +~~~~~~~~~~~ + +*Data Labels* are determined per cell for structured data (column/row when the *profiler* is used) or at the character level for unstructured data. + +* UNKNOWN +* ADDRESS +* BAN (bank account number, 10-18 digits) +* CREDIT_CARD +* EMAIL_ADDRESS +* UUID +* HASH_OR_KEY (md5, sha1, sha256, random hash, etc.) +* IPV4 +* IPV6 +* MAC_ADDRESS +* PERSON +* PHONE_NUMBER +* SSN +* URL +* US_STATE +* DRIVERS_LICENSE +* DATE +* TIME +* DATETIME +* INTEGER +* FLOAT +* QUANTITY +* ORDINAL + + +Get Started +=========== + +Load a File +~~~~~~~~~~~ + +The profiler should automatically identify the file type and load the data into a `Data Class`. + +Along with other attributtes the `Data class` enables structured data to be accessed via a valid Pandas DataFrame. + +.. code-block:: python + + # Load a csv file, return a CSVData object + csv_data = Data('your_file.csv') + + # Print the first 10 rows of the csv file + print(csv_data.data.head(10)) + + # Load a parquet file, return a ParquetData object + parquet_data = Data('your_file.parquet') + + # Sort the data by the name column + parquet_data.data.sort_values(by='name', inplace=True) + + # Print the sorted first 10 rows of the parquet data + print(parquet_data.data.head(10)) + + +If the file type is not automatically identified (rare), you can specify them +specifically, see section Data Readers. + +Profile a File +~~~~~~~~~~~~~~ + +Example uses a CSV file for example, but CSV, JSON, Avro, Parquet or Text should also work. + +.. code-block:: python + + import json + from dataprofiler import Data, Profiler + + # Load file (CSV should be automatically identified) + data = Data("your_file.csv") + + # Profile the dataset + profile = Profiler(data) + + # Generate a report and use json to prettify. + report = profile.report(report_options={"output_format":"pretty"}) + + # Print the report + print(json.dumps(report, indent=4)) + +Updating Profiles +~~~~~~~~~~~~~~~~~ + +Currently, the data profiler is equipped to update its profile in batches. + +.. code-block:: python + + import json + from dataprofiler import Data, Profiler + + # Load and profile a CSV file + data = Data("your_file.csv") + profile = Profiler(data) + + # Update the profile with new data: + new_data = Data("new_data.csv") + profile.update_profile(new_data) + + # Print the report using json to prettify. 
    report = profile.report(report_options={"output_format":"pretty"})
    print(json.dumps(report, indent=4))


Merging Profiles
~~~~~~~~~~~~~~~~

If you have two files with the same schema (but different data), it is possible to merge the two profiles together via an addition operator.

This also enables profiles to be determined in a distributed manner.

.. code-block:: python

    import json
    from dataprofiler import Data, Profiler

    # Load a CSV file with a schema
    data1 = Data("file_a.csv")
    profile1 = Profiler(data1)

    # Load another CSV file with the same schema
    data2 = Data("file_b.csv")
    profile2 = Profiler(data2)

    profile3 = profile1 + profile2

    # Print the report using json to prettify.
    report = profile3.report(report_options={"output_format":"pretty"})
    print(json.dumps(report, indent=4))

Profile a Pandas DataFrame
~~~~~~~~~~~~~~~~~~~~~~~~~~

.. code-block:: python

    import pandas as pd
    import dataprofiler as dp
    import json

    my_dataframe = pd.DataFrame([[1, 2.0],[1, 2.2],[-1, 3]])
    profile = dp.Profiler(my_dataframe)

    # print the report using json to prettify.
    report = profile.report(report_options={"output_format":"pretty"})
    print(json.dumps(report, indent=4))

    # read a specified column, in this case it is labeled 0:
    print(json.dumps(report["data stats"][0], indent=4))


Unstructured Profiler
~~~~~~~~~~~~~~~~~~~~~

In addition to the structured profiler, the Data Profiler provides unstructured
profiling for the TextData object or string. Unstructured profiling also works
with list(string), pd.Series(string) or pd.DataFrame(string) when the profiler_type
option is specified as `unstructured`. Below is an example of an unstructured profile
with a text file.

.. code-block:: python

    import dataprofiler as dp
    import json

    my_text = dp.Data('text_file.txt')
    profile = dp.Profiler(my_text)

    # print the report using json to prettify.
    report = profile.report(report_options={"output_format":"pretty"})
    print(json.dumps(report, indent=4))

Another example of an unstructured profile, this time with a pd.Series of strings, is given below:

.. code-block:: python

    import dataprofiler as dp
    import pandas as pd
    import json

    text_data = pd.Series(['first string', 'second string'])
    profile = dp.Profiler(text_data, profiler_type="unstructured")

    # print the report using json to prettify.
    report = profile.report(report_options={"output_format":"pretty"})
    print(json.dumps(report, indent=4))

Specifying a Filetype or Delimiter
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Example of specifying a CSV data type, with a `,` delimiter.
In addition, it utilizes only the first 10,000 rows.

.. code-block:: python

    import json
    from dataprofiler import Data, Profiler
    from dataprofiler.data_readers.csv_data import CSVData

    # Load a CSV file, with "," as the delimiter
    data = CSVData("your_file.csv", options={"delimiter": ","})

    # Split the data, such that only the first 10,000 rows are used
    data = data.data[0:10000]

    # Read in profile and print results
    profile = Profiler(data)
    print(json.dumps(profile.report(report_options={"output_format":"pretty"}), indent=4))


.. toctree::
    :maxdepth: 2
    :hidden:
    :caption: Getting Started:

    Intro
    install.rst
    data_readers.rst
    profiler.rst
    data_labeling.rst
    graphs.rst

.. toctree::
    :maxdepth: 2
    :hidden:
    :caption: User Guide:

    examples.rst
    API.rst

..
toctree:: + :maxdepth: 2 + :hidden: + :caption: Community: + + Changelog + Feedback + GitHub + Contributing + +.. _Example CSV: https://raw.githubusercontent.com/capitalone/DataProfiler/main/dataprofiler/tests/data/csv/aws_honeypot_marx_geo.csv +.. _issue: https://github.com/capitalone/DataProfiler/issues/new/choose + +Versions +======== +* `0.7.1`_ +* `0.7.0`_ +* `0.6.0`_ +* `0.5.3`_ +* `0.5.2`_ +* `0.5.1`_ +* `0.5.0`_ +* `0.4.7`_ +* `0.4.6`_ +* `0.4.5`_ +* `0.4.4`_ +* `0.4.3`_ +* `0.3.0`_ + +.. _0.3.0: ../../v0.3/html/index.html +.. _0.4.3: ../../0.4.3/html/index.html + +.. _0.4.4: ../../0.4.4/html/index.html + +.. _0.4.5: ../../0.4.5/html/index.html + +.. _0.4.6: ../../0.4.6/html/index.html + +.. _0.4.7: ../../0.4.7/html/index.html + +.. _0.5.0: ../../0.5.0/html/index.html + +.. _0.5.1: ../../0.5.1/html/index.html + +.. _0.5.2: ../../0.5.2/html/index.html + +.. _0.5.3: ../../0.5.3/html/index.html +.. _0.6.0: ../../0.6.0/html/index.html + +.. _0.7.0: ../../0.7.0/html/index.html + +.. _0.7.1: ../../0.7.1/html/index.html + diff --git a/docs/0.7.1/html/_sources/install.rst.txt b/docs/0.7.1/html/_sources/install.rst.txt new file mode 100644 index 000000000..70cdb1a2a --- /dev/null +++ b/docs/0.7.1/html/_sources/install.rst.txt @@ -0,0 +1,138 @@ +.. _install: + +Install +******* + +To install the full package from pypi: + +.. code-block:: console + + pip install DataProfiler[ml] + +If the ML requirements are too strict (say, you don't want to install +tensorflow), you can install a slimmer package. The slimmer package disables +the default sensitive data detection / entity recognition (labler) + +Install from pypi: + +.. code-block:: console + + pip install DataProfiler + +Snappy Installation +=================== + +This is required to profile parquet/avro datasets + +MacOS with homebrew: + +.. code-block:: console + + brew install snappy + + +Linux install: + +.. code-block:: console + + sudo apt-get -y install libsnappy-dev + + +Build From Scratch +================== + +NOTE: Installation for python3 + +virtualenv install: + +.. code-block:: console + + python3 -m pip install virtualenv + + +Setup virtual env: + +.. code-block:: console + + python3 -m virtualenv --python=python3 venv3 + source venv3/bin/activate + + +Install requirements: + +.. code-block:: console + + pip3 install -r requirements.txt + +Install labeler dependencies: + +.. code-block:: console + + pip3 install -r requirements-ml.txt + + +Install via the repo -- Build setup.py and install locally: + +.. code-block:: console + + python3 setup.py sdist bdist bdist_wheel + pip3 install dist/DataProfiler*-py3-none-any.whl + + +If you see: + +.. code-block:: console + + ERROR: Double requirement given:dataprofiler==X.Y.Z from dataprofiler/dist/DataProfiler-X.Y.Z-py3-none-any.whl (already in dataprofiler==X2.Y2.Z2 from dataprofiler/dist/DataProfiler-X2.Y2.Z2-py3-none-any.whl, name='dataprofiler') + +This means that you have multiple versions of the DataProfiler distribution +in the dist folder. +To resolve, either remove the older one or delete the folder and rerun the steps +above. + +Install via github: + +.. code-block:: console + + pip3 install git+https://github.com/capitalone/dataprofiler.git#egg=dataprofiler + + + +Testing +======= + +For testing, install test requirements: + +.. code-block:: console + + pip3 install -r requirements-test.txt + + +To run all unit tests, use: + +.. code-block:: console + + DATAPROFILER_SEED=0 python3 -m unittest discover -p "test*.py" + + +To run file of unit tests, use form: + +.. 
code-block:: console

    DATAPROFILER_SEED=0 python3 -m unittest discover -p test_profile_builder.py


To run a file with Pytest use:

.. code-block:: console

    DATAPROFILER_SEED=0 pytest dataprofiler/tests/data_readers/test_csv_data.py -v


To run an individual unit test, use the form:

.. code-block:: console

    DATAPROFILER_SEED=0 python3 -m unittest dataprofiler.tests.profilers.test_profile_builder.TestProfiler


diff --git a/docs/0.7.1/html/_sources/labeler.nblink.txt b/docs/0.7.1/html/_sources/labeler.nblink.txt
new file mode 100644
index 000000000..bed6517bf
--- /dev/null
+++ b/docs/0.7.1/html/_sources/labeler.nblink.txt
@@ -0,0 +1,6 @@
{
    "path": "../../feature_branch/examples/labeler.ipynb",
    "extra-media": [
        "../../feature_branch/examples/DL-Flowchart.png"
    ]
}
\ No newline at end of file
diff --git a/docs/0.7.1/html/_sources/modules.rst.txt b/docs/0.7.1/html/_sources/modules.rst.txt
new file mode 100644
index 000000000..0593459df
--- /dev/null
+++ b/docs/0.7.1/html/_sources/modules.rst.txt
@@ -0,0 +1,7 @@
dataprofiler
============

.. toctree::
    :maxdepth: 4

    dataprofiler
diff --git a/docs/0.7.1/html/_sources/overview.nblink.txt b/docs/0.7.1/html/_sources/overview.nblink.txt
new file mode 100644
index 000000000..3d9f89d3d
--- /dev/null
+++ b/docs/0.7.1/html/_sources/overview.nblink.txt
@@ -0,0 +1,3 @@
{
    "path": "../../feature_branch/examples/intro_data_profiler.ipynb"
}
\ No newline at end of file
diff --git a/docs/0.7.1/html/_sources/profiler.rst.txt b/docs/0.7.1/html/_sources/profiler.rst.txt
new file mode 100644
index 000000000..0fbfc5923
--- /dev/null
+++ b/docs/0.7.1/html/_sources/profiler.rst.txt
@@ -0,0 +1,678 @@
.. _profiler:

Profiler
********

Profile Your Data
=================

Profiling your data is easy. Just use the data reader, send the data to the
profiler, and print out the report.

.. code-block:: python

    import json
    from dataprofiler import Data, Profiler

    data = Data("your_file.csv")  # Auto-Detect & Load: CSV, AVRO, Parquet, JSON, Text

    profile = Profiler(data)  # Calculate Statistics, Entity Recognition, etc

    readable_report = profile.report(report_options={"output_format": "pretty"})
    print(json.dumps(readable_report, indent=4))

If the data is structured, the profile will return global statistics as well as
column-by-column statistics. The full set of statistics is listed on the
intro page.

Load a File
~~~~~~~~~~~

The profiler should automatically identify the file type and load the data into a `Data Class`.

Along with other attributes, the `Data class` enables structured data to be accessed via a valid Pandas DataFrame.

.. code-block:: python

    # Load a csv file, return a CSVData object
    csv_data = Data('your_file.csv')

    # Print the first 10 rows of the csv file
    print(csv_data.data.head(10))

    # Load a parquet file, return a ParquetData object
    parquet_data = Data('your_file.parquet')

    # Sort the data by the name column
    parquet_data.data.sort_values(by='name', inplace=True)

    # Print the sorted first 10 rows of the parquet data
    print(parquet_data.data.head(10))


If the file type is not automatically identified (rare), you can specify it
explicitly; see the Data Readers section.

Profile a File
~~~~~~~~~~~~~~

This example uses a CSV file, but JSON, Avro, or Parquet files also work.

.. code-block:: python

    import json
    from dataprofiler import Data, Profiler

    # Load file (CSV should be automatically identified)
    data = Data("your_file.csv")

    # Profile the dataset
    profile = Profiler(data)

    # Generate a report and use json to prettify.
    report = profile.report(report_options={"output_format": "pretty"})

    # Print the report
    print(json.dumps(report, indent=4))

Updating Profiles
~~~~~~~~~~~~~~~~~

Currently, the data profiler is equipped to update its profile in batches.

.. code-block:: python

    import json
    from dataprofiler import Data, Profiler

    # Load and profile a CSV file
    data = Data("your_file.csv")
    profile = Profiler(data)

    # Update the profile with new data:
    new_data = Data("new_data.csv")
    profile.update_profile(new_data)

    # Print the report using json to prettify.
    report = profile.report(report_options={"output_format": "pretty"})
    print(json.dumps(report, indent=4))


Merging Profiles
~~~~~~~~~~~~~~~~

If you have two files with the same schema (but different data), it is possible to merge the two profiles together via an addition operator.

This also enables profiles to be determined in a distributed manner.

.. code-block:: python

    import json
    from dataprofiler import Data, Profiler

    # Load a CSV file with a schema
    data1 = Data("file_a.csv")
    profile1 = Profiler(data1)

    # Load another CSV file with the same schema
    data2 = Data("file_b.csv")
    profile2 = Profiler(data2)

    profile3 = profile1 + profile2

    # Print the report using json to prettify.
    report = profile3.report(report_options={"output_format": "pretty"})
    print(json.dumps(report, indent=4))


Profile Differences
~~~~~~~~~~~~~~~~~~~

Profile differences take two profiles and find the differences
between them. Create the difference report like this:

.. code-block:: python

    from dataprofiler import Data, Profiler

    # Load a CSV file
    data1 = Data("file_a.csv")
    profile1 = Profiler(data1)

    # Load another CSV file
    data2 = Data("file_b.csv")
    profile2 = Profiler(data2)

    diff_report = profile1.diff(profile2)
    print(diff_report)

The difference report contains a dictionary that mirrors the profile report.
Each data type has its own difference:

* **Int/Float** - One profile subtracts the value from the other.

* **String** - The strings will be shown in a list:

  - [profile1 str, profile2 str]
* **List** - A list of 3 will be returned showing the unique values of
  each profile and the shared values:

  - [profile 1 unique values, shared values, profile 2 unique values]
* **Dict** - Some dictionaries with varied keys will also return a list
  of three in the format:

  - [profile 1 unique key-values, shared key differences, profile 2 unique key-values]

Otherwise, when no differences occur:

* **Any Type No Differences** - A string will report: "unchanged".

Below is the structured difference report:

.. code-block:: python

    {
        'global_stats': {
            'file_type': [str, str],
            'encoding': [str, str],
            'samples_used': int,
            'column_count': int,
            'row_count': int,
            'row_has_null_ratio': float,
            'row_is_null_ratio': float,
            'unique_row_ratio': float,
            'duplicate_row_count': int,
            'correlation_matrix': list[list[float]],
            'profile_schema': list[dict[str, int]]
        },
        'data_stats': [{
            'column_name': str,
            'data_type': [str, str],
            'data_label': [list[str], list[str], list[str]],
            'categorical': [str, str],
            'order': [str, str],
            'statistics': {
                'min': float,
                'max': float,
                'sum': float,
                'mean': float,
                'variance': float,
                'stddev': float,
                't-test': {
                    't-statistic': float,
                    'conservative': {'df': int,
                                     'p-value': float},
                    'welch': {'df': float,
                              'p-value': float}},
                "chi2-test": {
                    "chi2-statistic": float,
                    "df": int,
                    "p-value": float
                },
                'unique_count': int,
                'unique_ratio': float,
                'categories': [list[str], list[str], list[str]],
                'gini_impurity': float,
                'unalikeability': float,
                'categorical_count': [dict[str, int], dict[str, int], dict[str, int]],
                'avg_predictions': [dict[str, float]],
                'label_representation': [dict[str, float]],
                'sample_size': int,
                'null_count': int,
                'null_types': [list[str], list[str], list[str]],
                'null_types_index': [dict[str, int], dict[str, int], dict[str, int]],
                'data_type_representation': [dict[str, float]]
            }
        }]
    }

Below is the unstructured difference report:

.. code-block:: python

    {
        'global_stats': {
            'file_type': [str, str],
            'encoding': [str, str],
            'samples_used': int,
            'empty_line_count': int,
            'memory_size': float
        },
        'data_stats': {
            'data_label': {
                'entity_counts': {
                    'word_level': dict[str, int],
                    'true_char_level': dict[str, int],
                    'postprocess_char_level': dict[str, int]
                },
                'entity_percentages': {
                    'word_level': dict[str, float],
                    'true_char_level': dict[str, float],
                    'postprocess_char_level': dict[str, float]
                }
            },
            'statistics': {
                'vocab': [list[str], list[str], list[str]],
                'vocab_count': [dict[str, int], dict[str, int], dict[str, int]],
                'words': [list[str], list[str], list[str]],
                'word_count': [dict[str, int], dict[str, int], dict[str, int]]
            }
        }
    }


Saving and Loading a Profile
~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The profiles can easily be saved and loaded as shown below:

.. code-block:: python

    import json
    from dataprofiler import Data, Profiler

    # Load a CSV file
    data = Data("your_file.csv")

    # Read in profile and print results
    profile = Profiler(data)
    profile.save(filepath="my_profile.pkl")

    loaded_profile = Profiler.load("my_profile.pkl")
    print(json.dumps(loaded_profile.report(report_options={"output_format": "compact"}),
                     indent=4))


Structured vs Unstructured Profiles
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

When using the profiler, the data profiler will automatically infer whether to
create the structured profile or the unstructured profile. However, you can be
explicit as shown below:

.. code-block:: python

    import json
    from dataprofiler import Data, Profiler

    # Creating a structured profile
    data1 = Data("normal_csv_file.csv")
    structured_profile = Profiler(data1, profiler_type="structured")

    structured_report = structured_profile.report(report_options={"output_format": "pretty"})
    print(json.dumps(structured_report, indent=4))

    # Creating an unstructured profile
    data2 = Data("normal_text_file.txt")
    unstructured_profile = Profiler(data2, profiler_type="unstructured")

    unstructured_report = unstructured_profile.report(report_options={"output_format": "pretty"})
    print(json.dumps(unstructured_report, indent=4))


Setting the Sample Size
~~~~~~~~~~~~~~~~~~~~~~~

There are two ways to set the sample size in a profile: samples_per_update and
min_true_samples. The samples_per_update option takes an integer as the exact number
of samples to use. The min_true_samples option sets the minimum number of samples that
are not null. For example:

.. code-block:: python

    from dataprofiler import Profiler

    sample_array = [1.0, None, 2.0]
    profile = Profiler(sample_array, samples_per_update=2)

The first two samples (1.0 and None) are used for the statistical analysis.

In contrast, if we also set min_true_samples to 2, then the Data Reader will
continue to read until the minimum number of true samples is found for the given column.
For example:

.. code-block:: python

    from dataprofiler import Profiler

    sample_array = [1.0, None, 2.0]
    profile = Profiler(sample_array, samples_per_update=2, min_true_samples=2)

This will use all samples in the statistical analysis until the number of "true"
(non-null) values is reached. Both the min_true_samples and
samples_per_update conditions must be met. In this case, the profile will grab
the first two samples (1.0 and None) to satisfy samples_per_update, and then
it will grab the first two VALID samples (1.0 and 2.0) to satisfy
min_true_samples.

Profile a Pandas DataFrame
~~~~~~~~~~~~~~~~~~~~~~~~~~

.. code-block:: python

    import pandas as pd
    import dataprofiler as dp
    import json

    my_dataframe = pd.DataFrame([[1, 2.0],[1, 2.2],[-1, 3]])
    profile = dp.Profiler(my_dataframe)

    # print the report using json to prettify.
    report = profile.report(report_options={"output_format": "pretty"})
    print(json.dumps(report, indent=4))

    # read a specified column, in this case it is labeled 0:
    print(json.dumps(report["data stats"][0], indent=4))


Specifying a Filetype or Delimiter
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Example of specifying a CSV data type, with a `,` delimiter.
In addition, it utilizes only the first 10,000 rows.

.. code-block:: python

    import json
    from dataprofiler import Data, Profiler
    from dataprofiler.data_readers.csv_data import CSVData

    # Load a CSV file, with "," as the delimiter
    data = CSVData("your_file.csv", options={"delimiter": ","})

    # Split the data, such that only the first 10,000 rows are used
    data = data.data[0:10000]

    # Read in profile and print results
    profile = Profiler(data)
    print(json.dumps(profile.report(report_options={"output_format": "pretty"}), indent=4))

Setting Profiler Seed
~~~~~~~~~~~~~~~~~~~~~

Example of specifying a seed for reproducibility.

.. code-block:: python

    import dataprofiler as dp

    # Set seed to non-negative integer value or None
    dp.set_seed(0)



Profile Options
===============

The data profiler accepts several options to toggle features on and off.
The column profiler options (int options, float options, datetime options,
text options, order options, category options, and data labeler options) can be
enabled or disabled. By default, all options are toggled on. Below is an example
of how to alter these options. Options shared by the structured and unstructured profilers
must be specified as structured or unstructured when setting them (e.g., the data labeler options).

.. code-block:: python

    import json
    from dataprofiler import Data, Profiler, ProfilerOptions

    # Load and profile a CSV file
    data = Data("your_file.csv")
    profile_options = ProfilerOptions()

    # All of these are different examples of adjusting the profile options

    # Options can be toggled directly like this:
    profile_options.structured_options.text.is_enabled = False
    profile_options.structured_options.text.vocab.is_enabled = True
    profile_options.structured_options.int.variance.is_enabled = True
    profile_options.structured_options.data_labeler.data_labeler_dirpath = \
        "Wheres/My/Datalabeler"
    profile_options.structured_options.data_labeler.is_enabled = False

    # A dictionary can be sent in to set the properties for all the options
    profile_options.set({"structured_options.data_labeler.is_enabled": False, "min.is_enabled": False})

    # Specific columns can be set/disabled/enabled in the same way
    profile_options.structured_options.text.set({"max.is_enabled": True,
                                                 "variance.is_enabled": True})

    # numeric stats can be turned off/on entirely
    profile_options.set({"is_numeric_stats_enabled": False})
    profile_options.set({"int.is_numeric_stats_enabled": False})

    profile = Profiler(data, options=profile_options)

    # Print the report using json to prettify.
    report = profile.report(report_options={"output_format": "pretty"})
    print(json.dumps(report, indent=4))


Below is a breakdown of all the options.

* **ProfilerOptions** - The top-level options class that contains options for the Profiler class

  * **structured_options** - Options responsible for all structured data

    * **multiprocess** - Option to enable multiprocessing. Automatically selects the optimal number of processes to utilize based on system constraints.
+ + * is_enabled - (Boolean) Enables or disables multiprocessing + * **int** - Options for the integer columns + + * is_enabled - (Boolean) Enables or disables the integer operations + * min - Finds minimum value in a column + + * is_enabled - (Boolean) Enables or disables min + * max - Finds maximum value in a column + + * is_enabled - (Boolean) Enables or disables max + * sum - Finds sum of all values in a column + + * is_enabled - (Boolean) Enables or disables sum + + * variance - Finds variance of all values in a column + + * is_enabled - (Boolean) Enables or disables variance + * skewness - Finds skewness of all values in a column + + * is_enabled - (Boolean) Enables or disables skewness + * kurtosis - Finds kurtosis of all values in a column + + * is_enabled - (Boolean) Enables or disables kurtosis + * num_zeros - Finds the count of zeros in a column + + * is_enabled - (Boolean) Enables or disables num_zeros + * num_negatives - Finds the count of negative numbers in a column + + * is_enabled - (Boolean) Enables or disables num_negatives + * bias_correction - Applies bias correction to variance, skewness, and kurtosis calculations + + * is_enabled - (Boolean) Enables or disables bias correction + * histogram_and_quantiles - Generates a histogram and quantiles + from the column values + + * bin_count_or_method - (String/List[String]) Designates preferred method for calculating histogram bins or the number of bins to use. + If left unspecified (None) the optimal method will be chosen by attempting all methods. + If multiple specified (list) the optimal method will be chosen by attempting the provided ones. + methods: 'auto', 'fd', 'doane', 'scott', 'rice', 'sturges', 'sqrt' + Note: 'auto' is used to choose optimally between 'fd' and 'sturges' + * is_enabled - (Boolean) Enables or disables histogram and quantiles + * **float** - Options for the float columns + + * is_enabled - (Boolean) Enables or disables the float operations + * precision - Finds the precision (significant figures) within the column + + * is_enabled - (Boolean) Enables or disables precision + * sample_ratio - (Float) The ratio of 0 to 1 how much data (identified as floats) to utilize as samples in determining precision + + * min - Finds minimum value in a column + + * is_enabled - (Boolean) Enables or disables min + * max - Finds maximum value in a column + + * is_enabled - (Boolean) Enables or disables max + * sum - Finds sum of all values in a column + + * is_enabled - (Boolean) Enables or disables sum + * variance - Finds variance of all values in a column + + * is_enabled - (Boolean) Enables or disables variance + * skewness - Finds skewness of all values in a column + + * is_enabled - (Boolean) Enables or disables skewness + * kurtosis - Finds kurtosis of all values in a column + + * is_enabled - (Boolean) Enables or disables kurtosis + * num_zeros - Finds the count of zeros in a column + + * is_enabled - (Boolean) Enables or disables num_zeros + * num_negatives - Finds the count of negative numbers in a column + + * is_enabled - (Boolean) Enables or disables num_negatives + * bias_correction - Applies bias correction to variance, skewness, and kurtosis calculations + + * is_enabled - (Boolean) Enables or disables bias correction + * histogram_and_quantiles - Generates a histogram and quantiles + from the column values + + * bin_count_or_method - (String/List[String]) Designates preferred method for calculating histogram bins or the number of bins to use. 
+ If left unspecified (None) the optimal method will be chosen by attempting all methods. + If multiple specified (list) the optimal method will be chosen by attempting the provided ones. + methods: 'auto', 'fd', 'doane', 'scott', 'rice', 'sturges', 'sqrt' + Note: 'auto' is used to choose optimally between 'fd' and 'sturges' + * is_enabled - (Boolean) Enables or disables histogram and quantiles + * **text** - Options for the text columns + + * is_enabled - (Boolean) Enables or disables the text operations + * vocab - Finds all the unique characters used in a column + + * is_enabled - (Boolean) Enables or disables vocab + * min - Finds minimum value in a column + + * is_enabled - (Boolean) Enables or disables min + * max - Finds maximum value in a column + + * is_enabled - (Boolean) Enables or disables max + * sum - Finds sum of all values in a column + + * is_enabled - (Boolean) Enables or disables sum + * variance - Finds variance of all values in a column + + * is_enabled - (Boolean) Enables or disables variance + * skewness - Finds skewness of all values in a column + + * is_enabled - (Boolean) Enables or disables skewness + * kurtosis - Finds kurtosis of all values in a column + + * is_enabled - (Boolean) Enables or disables kurtosis + * bias_correction - Applies bias correction to variance, skewness, and kurtosis calculations + + * is_enabled - (Boolean) Enables or disables bias correction + * histogram_and_quantiles - Generates a histogram and quantiles + from the column values + + * bin_count_or_method - (String/List[String]) Designates preferred method for calculating histogram bins or the number of bins to use. + If left unspecified (None) the optimal method will be chosen by attempting all methods. + If multiple specified (list) the optimal method will be chosen by attempting the provided ones. 
+ methods: 'auto', 'fd', 'doane', 'scott', 'rice', 'sturges', 'sqrt' + Note: 'auto' is used to choose optimally between 'fd' and 'sturges' + * is_enabled - (Boolean) Enables or disables histogram and quantiles + * **datetime** - Options for the datetime columns + + * is_enabled - (Boolean) Enables or disables the datetime operations + * **order** - Options for the order columns + + * is_enabled - (Boolean) Enables or disables the order operations + * **category** - Options for the category columns + + * is_enabled - (Boolean) Enables or disables the category operations + * **data_labeler** - Options for the data labeler columns + + * is_enabled - (Boolean) Enables or disables the data labeler operations + * data_labeler_dirpath - (String) Directory path to data labeler + * data_labeler_object - (BaseDataLabeler) Datalabeler to replace + the default labeler + * max_sample_size - (Int) The max number of samples for the data + labeler + * **unstructured_options** - Options responsible for all unstructured data + + * **text** - Options for the text profile + + * is_case_sensitive - (Boolean) Specify whether the profile is case sensitive + * stop_words - (List of Strings) List of stop words to be removed when profiling + * top_k_chars - (Int) Number of top characters to be retrieved when profiling + * top_k_words - (Int) Number of top words to be retrieved when profiling + * vocab - Options for vocab count + + * is_enabled - (Boolean) Enables or disables the vocab stats + * words - Options for word count + + * is_enabled - (Boolean) Enables or disables the word stats + * **data_labeler** - Options for the data labeler + + * is_enabled - (Boolean) Enables or disables the data labeler operations + * data_labeler_dirpath - (String) Directory path to data labeler + * data_labeler_object - (BaseDataLabeler) Datalabeler to replace + the default labeler + * max_sample_size - (Int) The max number of samples for the data + labeler + + + +Statistical Dependency on Order of Updates +========================================== + +Some profile features/statistics are dependent on the order in which the profiler +is updated with new data. + +Order Profile +~~~~~~~~~~~~~ + +The order profiler utilizes the last value in the previous data batch to ensure +the subsequent dataset is above/below/equal to that value when predicting +non-random order. + +For instance, a dataset to be predicted as ascending would require the following +batch data update to be ascending and its first value `>=` than that of the +previous batch of data. + +Ex. of ascending: + +.. code-block:: python + + batch_1 = [0, 1, 2] + batch_2 = [3, 4, 5] + +Ex. of random: + +.. code-block:: python + + batch_1 = [0, 1, 2] + batch_2 = [1, 2, 3] # notice how the first value is less than the last value in the previous batch + + +Reporting Structure +=================== + +For every profile, we can provide a report and customize it with a couple optional parameters: + +* output_format (string) + + * This will allow the user to decide the output format for report. + + * Options are one of [pretty, compact, serializable, flat]: + + * Pretty: floats are rounded to four decimal places, and lists are shortened. + * Compact: Similar to pretty, but removes detailed statistics such as runtimes, label probabilities, index locations of null types, etc. + * Serializable: Output is json serializable and not prettified + * Flat: Nested output is returned as a flattened dictionary +* num_quantile_groups (int) + + * You can sample your data as you like! 
With a minimum of one and a maximum of 1000, you can decide the number of quantile groups! + +.. code-block:: python + + report = profile.report(report_options={"output_format": "pretty"}) + report = profile.report(report_options={"output_format": "compact"}) + report = profile.report(report_options={"output_format": "serializable"}) + report = profile.report(report_options={"output_format": "flat"}) + diff --git a/docs/0.7.1/html/_sources/profiler_example.nblink.txt b/docs/0.7.1/html/_sources/profiler_example.nblink.txt new file mode 100644 index 000000000..8b1612784 --- /dev/null +++ b/docs/0.7.1/html/_sources/profiler_example.nblink.txt @@ -0,0 +1,3 @@ +{ + "path": "../../feature_branch/examples/structured_profilers.ipynb" +} \ No newline at end of file diff --git a/docs/0.7.1/html/_sources/unstructured_profiler_example.nblink.txt b/docs/0.7.1/html/_sources/unstructured_profiler_example.nblink.txt new file mode 100644 index 000000000..1589c41d4 --- /dev/null +++ b/docs/0.7.1/html/_sources/unstructured_profiler_example.nblink.txt @@ -0,0 +1,3 @@ +{ + "path": "../../feature_branch/examples/unstructured_profilers.ipynb" +} \ No newline at end of file diff --git a/docs/0.7.1/html/_static/DataProfilerLogoLightTheme.png b/docs/0.7.1/html/_static/DataProfilerLogoLightTheme.png new file mode 100644 index 000000000..35e59c349 Binary files /dev/null and b/docs/0.7.1/html/_static/DataProfilerLogoLightTheme.png differ diff --git a/docs/0.7.1/html/_static/basic.css b/docs/0.7.1/html/_static/basic.css new file mode 100644 index 000000000..912859b55 --- /dev/null +++ b/docs/0.7.1/html/_static/basic.css @@ -0,0 +1,904 @@ +/* + * basic.css + * ~~~~~~~~~ + * + * Sphinx stylesheet -- basic theme. + * + * :copyright: Copyright 2007-2021 by the Sphinx team, see AUTHORS. + * :license: BSD, see LICENSE for details. 
+ * + */ + +/* -- main layout ----------------------------------------------------------- */ + +div.clearer { + clear: both; +} + +div.section::after { + display: block; + content: ''; + clear: left; +} + +/* -- relbar ---------------------------------------------------------------- */ + +div.related { + width: 100%; + font-size: 90%; +} + +div.related h3 { + display: none; +} + +div.related ul { + margin: 0; + padding: 0 0 0 10px; + list-style: none; +} + +div.related li { + display: inline; +} + +div.related li.right { + float: right; + margin-right: 5px; +} + +/* -- sidebar --------------------------------------------------------------- */ + +div.sphinxsidebarwrapper { + padding: 10px 5px 0 10px; +} + +div.sphinxsidebar { + float: left; + width: 230px; + margin-left: -100%; + font-size: 90%; + word-wrap: break-word; + overflow-wrap : break-word; +} + +div.sphinxsidebar ul { + list-style: none; +} + +div.sphinxsidebar ul ul, +div.sphinxsidebar ul.want-points { + margin-left: 20px; + list-style: square; +} + +div.sphinxsidebar ul ul { + margin-top: 0; + margin-bottom: 0; +} + +div.sphinxsidebar form { + margin-top: 10px; +} + +div.sphinxsidebar input { + border: 1px solid #98dbcc; + font-family: sans-serif; + font-size: 1em; +} + +div.sphinxsidebar #searchbox form.search { + overflow: hidden; +} + +div.sphinxsidebar #searchbox input[type="text"] { + float: left; + width: 80%; + padding: 0.25em; + box-sizing: border-box; +} + +div.sphinxsidebar #searchbox input[type="submit"] { + float: left; + width: 20%; + border-left: none; + padding: 0.25em; + box-sizing: border-box; +} + + +img { + border: 0; + max-width: 100%; +} + +/* -- search page ----------------------------------------------------------- */ + +ul.search { + margin: 10px 0 0 20px; + padding: 0; +} + +ul.search li { + padding: 5px 0 5px 20px; + background-image: url(file.png); + background-repeat: no-repeat; + background-position: 0 7px; +} + +ul.search li a { + font-weight: bold; +} + +ul.search li p.context { + color: #888; + margin: 2px 0 0 30px; + text-align: left; +} + +ul.keywordmatches li.goodmatch a { + font-weight: bold; +} + +/* -- index page ------------------------------------------------------------ */ + +table.contentstable { + width: 90%; + margin-left: auto; + margin-right: auto; +} + +table.contentstable p.biglink { + line-height: 150%; +} + +a.biglink { + font-size: 1.3em; +} + +span.linkdescr { + font-style: italic; + padding-top: 5px; + font-size: 90%; +} + +/* -- general index --------------------------------------------------------- */ + +table.indextable { + width: 100%; +} + +table.indextable td { + text-align: left; + vertical-align: top; +} + +table.indextable ul { + margin-top: 0; + margin-bottom: 0; + list-style-type: none; +} + +table.indextable > tbody > tr > td > ul { + padding-left: 0em; +} + +table.indextable tr.pcap { + height: 10px; +} + +table.indextable tr.cap { + margin-top: 10px; + background-color: #f2f2f2; +} + +img.toggler { + margin-right: 3px; + margin-top: 3px; + cursor: pointer; +} + +div.modindex-jumpbox { + border-top: 1px solid #ddd; + border-bottom: 1px solid #ddd; + margin: 1em 0 1em 0; + padding: 0.4em; +} + +div.genindex-jumpbox { + border-top: 1px solid #ddd; + border-bottom: 1px solid #ddd; + margin: 1em 0 1em 0; + padding: 0.4em; +} + +/* -- domain module index --------------------------------------------------- */ + +table.modindextable td { + padding: 2px; + border-collapse: collapse; +} + +/* -- general body styles --------------------------------------------------- */ + 
+div.body { + min-width: 450px; + max-width: 800px; +} + +div.body p, div.body dd, div.body li, div.body blockquote { + -moz-hyphens: auto; + -ms-hyphens: auto; + -webkit-hyphens: auto; + hyphens: auto; +} + +a.headerlink { + visibility: hidden; +} + +a.brackets:before, +span.brackets > a:before{ + content: "["; +} + +a.brackets:after, +span.brackets > a:after { + content: "]"; +} + +h1:hover > a.headerlink, +h2:hover > a.headerlink, +h3:hover > a.headerlink, +h4:hover > a.headerlink, +h5:hover > a.headerlink, +h6:hover > a.headerlink, +dt:hover > a.headerlink, +caption:hover > a.headerlink, +p.caption:hover > a.headerlink, +div.code-block-caption:hover > a.headerlink { + visibility: visible; +} + +div.body p.caption { + text-align: inherit; +} + +div.body td { + text-align: left; +} + +.first { + margin-top: 0 !important; +} + +p.rubric { + margin-top: 30px; + font-weight: bold; +} + +img.align-left, figure.align-left, .figure.align-left, object.align-left { + clear: left; + float: left; + margin-right: 1em; +} + +img.align-right, figure.align-right, .figure.align-right, object.align-right { + clear: right; + float: right; + margin-left: 1em; +} + +img.align-center, figure.align-center, .figure.align-center, object.align-center { + display: block; + margin-left: auto; + margin-right: auto; +} + +img.align-default, figure.align-default, .figure.align-default { + display: block; + margin-left: auto; + margin-right: auto; +} + +.align-left { + text-align: left; +} + +.align-center { + text-align: center; +} + +.align-default { + text-align: center; +} + +.align-right { + text-align: right; +} + +/* -- sidebars -------------------------------------------------------------- */ + +div.sidebar, +aside.sidebar { + margin: 0 0 0.5em 1em; + border: 1px solid #ddb; + padding: 7px; + background-color: #ffe; + width: 40%; + float: right; + clear: right; + overflow-x: auto; +} + +p.sidebar-title { + font-weight: bold; +} + +div.admonition, div.topic, blockquote { + clear: left; +} + +/* -- topics ---------------------------------------------------------------- */ + +div.topic { + border: 1px solid #ccc; + padding: 7px; + margin: 10px 0 10px 0; +} + +p.topic-title { + font-size: 1.1em; + font-weight: bold; + margin-top: 10px; +} + +/* -- admonitions ----------------------------------------------------------- */ + +div.admonition { + margin-top: 10px; + margin-bottom: 10px; + padding: 7px; +} + +div.admonition dt { + font-weight: bold; +} + +p.admonition-title { + margin: 0px 10px 5px 0px; + font-weight: bold; +} + +div.body p.centered { + text-align: center; + margin-top: 25px; +} + +/* -- content of sidebars/topics/admonitions -------------------------------- */ + +div.sidebar > :last-child, +aside.sidebar > :last-child, +div.topic > :last-child, +div.admonition > :last-child { + margin-bottom: 0; +} + +div.sidebar::after, +aside.sidebar::after, +div.topic::after, +div.admonition::after, +blockquote::after { + display: block; + content: ''; + clear: both; +} + +/* -- tables ---------------------------------------------------------------- */ + +table.docutils { + margin-top: 10px; + margin-bottom: 10px; + border: 0; + border-collapse: collapse; +} + +table.align-center { + margin-left: auto; + margin-right: auto; +} + +table.align-default { + margin-left: auto; + margin-right: auto; +} + +table caption span.caption-number { + font-style: italic; +} + +table caption span.caption-text { +} + +table.docutils td, table.docutils th { + padding: 1px 8px 1px 5px; + border-top: 0; + border-left: 0; + 
border-right: 0; + border-bottom: 1px solid #aaa; +} + +table.footnote td, table.footnote th { + border: 0 !important; +} + +th { + text-align: left; + padding-right: 5px; +} + +table.citation { + border-left: solid 1px gray; + margin-left: 1px; +} + +table.citation td { + border-bottom: none; +} + +th > :first-child, +td > :first-child { + margin-top: 0px; +} + +th > :last-child, +td > :last-child { + margin-bottom: 0px; +} + +/* -- figures --------------------------------------------------------------- */ + +div.figure, figure { + margin: 0.5em; + padding: 0.5em; +} + +div.figure p.caption, figcaption { + padding: 0.3em; +} + +div.figure p.caption span.caption-number, +figcaption span.caption-number { + font-style: italic; +} + +div.figure p.caption span.caption-text, +figcaption span.caption-text { +} + +/* -- field list styles ----------------------------------------------------- */ + +table.field-list td, table.field-list th { + border: 0 !important; +} + +.field-list ul { + margin: 0; + padding-left: 1em; +} + +.field-list p { + margin: 0; +} + +.field-name { + -moz-hyphens: manual; + -ms-hyphens: manual; + -webkit-hyphens: manual; + hyphens: manual; +} + +/* -- hlist styles ---------------------------------------------------------- */ + +table.hlist { + margin: 1em 0; +} + +table.hlist td { + vertical-align: top; +} + +/* -- object description styles --------------------------------------------- */ + +.sig { + font-family: 'Consolas', 'Menlo', 'DejaVu Sans Mono', 'Bitstream Vera Sans Mono', monospace; +} + +.sig-name, code.descname { + background-color: transparent; + font-weight: bold; +} + +.sig-name { + font-size: 1.1em; +} + +code.descname { + font-size: 1.2em; +} + +.sig-prename, code.descclassname { + background-color: transparent; +} + +.optional { + font-size: 1.3em; +} + +.sig-paren { + font-size: larger; +} + +.sig-param.n { + font-style: italic; +} + +/* C++ specific styling */ + +.sig-inline.c-texpr, +.sig-inline.cpp-texpr { + font-family: unset; +} + +.sig.c .k, .sig.c .kt, +.sig.cpp .k, .sig.cpp .kt { + color: #0033B3; +} + +.sig.c .m, +.sig.cpp .m { + color: #1750EB; +} + +.sig.c .s, .sig.c .sc, +.sig.cpp .s, .sig.cpp .sc { + color: #067D17; +} + + +/* -- other body styles ----------------------------------------------------- */ + +ol.arabic { + list-style: decimal; +} + +ol.loweralpha { + list-style: lower-alpha; +} + +ol.upperalpha { + list-style: upper-alpha; +} + +ol.lowerroman { + list-style: lower-roman; +} + +ol.upperroman { + list-style: upper-roman; +} + +:not(li) > ol > li:first-child > :first-child, +:not(li) > ul > li:first-child > :first-child { + margin-top: 0px; +} + +:not(li) > ol > li:last-child > :last-child, +:not(li) > ul > li:last-child > :last-child { + margin-bottom: 0px; +} + +ol.simple ol p, +ol.simple ul p, +ul.simple ol p, +ul.simple ul p { + margin-top: 0; +} + +ol.simple > li:not(:first-child) > p, +ul.simple > li:not(:first-child) > p { + margin-top: 0; +} + +ol.simple p, +ul.simple p { + margin-bottom: 0; +} + +dl.footnote > dt, +dl.citation > dt { + float: left; + margin-right: 0.5em; +} + +dl.footnote > dd, +dl.citation > dd { + margin-bottom: 0em; +} + +dl.footnote > dd:after, +dl.citation > dd:after { + content: ""; + clear: both; +} + +dl.field-list { + display: grid; + grid-template-columns: fit-content(30%) auto; +} + +dl.field-list > dt { + font-weight: bold; + word-break: break-word; + padding-left: 0.5em; + padding-right: 5px; +} + +dl.field-list > dt:after { + content: ":"; +} + +dl.field-list > dd { + padding-left: 0.5em; + 
margin-top: 0em; + margin-left: 0em; + margin-bottom: 0em; +} + +dl { + margin-bottom: 15px; +} + +dd > :first-child { + margin-top: 0px; +} + +dd ul, dd table { + margin-bottom: 10px; +} + +dd { + margin-top: 3px; + margin-bottom: 10px; + margin-left: 30px; +} + +dl > dd:last-child, +dl > dd:last-child > :last-child { + margin-bottom: 0; +} + +dt:target, span.highlighted { + background-color: #fbe54e; +} + +rect.highlighted { + fill: #fbe54e; +} + +dl.glossary dt { + font-weight: bold; + font-size: 1.1em; +} + +.versionmodified { + font-style: italic; +} + +.system-message { + background-color: #fda; + padding: 5px; + border: 3px solid red; +} + +.footnote:target { + background-color: #ffa; +} + +.line-block { + display: block; + margin-top: 1em; + margin-bottom: 1em; +} + +.line-block .line-block { + margin-top: 0; + margin-bottom: 0; + margin-left: 1.5em; +} + +.guilabel, .menuselection { + font-family: sans-serif; +} + +.accelerator { + text-decoration: underline; +} + +.classifier { + font-style: oblique; +} + +.classifier:before { + font-style: normal; + margin: 0.5em; + content: ":"; +} + +abbr, acronym { + border-bottom: dotted 1px; + cursor: help; +} + +/* -- code displays --------------------------------------------------------- */ + +pre { + overflow: auto; + overflow-y: hidden; /* fixes display issues on Chrome browsers */ +} + +pre, div[class*="highlight-"] { + clear: both; +} + +span.pre { + -moz-hyphens: none; + -ms-hyphens: none; + -webkit-hyphens: none; + hyphens: none; +} + +div[class*="highlight-"] { + margin: 1em 0; +} + +td.linenos pre { + border: 0; + background-color: transparent; + color: #aaa; +} + +table.highlighttable { + display: block; +} + +table.highlighttable tbody { + display: block; +} + +table.highlighttable tr { + display: flex; +} + +table.highlighttable td { + margin: 0; + padding: 0; +} + +table.highlighttable td.linenos { + padding-right: 0.5em; +} + +table.highlighttable td.code { + flex: 1; + overflow: hidden; +} + +.highlight .hll { + display: block; +} + +div.highlight pre, +table.highlighttable pre { + margin: 0; +} + +div.code-block-caption + div { + margin-top: 0; +} + +div.code-block-caption { + margin-top: 1em; + padding: 2px 5px; + font-size: small; +} + +div.code-block-caption code { + background-color: transparent; +} + +table.highlighttable td.linenos, +span.linenos, +div.highlight span.gp { /* gp: Generic.Prompt */ + user-select: none; + -webkit-user-select: text; /* Safari fallback only */ + -webkit-user-select: none; /* Chrome/Safari */ + -moz-user-select: none; /* Firefox */ + -ms-user-select: none; /* IE10+ */ +} + +div.code-block-caption span.caption-number { + padding: 0.1em 0.3em; + font-style: italic; +} + +div.code-block-caption span.caption-text { +} + +div.literal-block-wrapper { + margin: 1em 0; +} + +code.xref, a code { + background-color: transparent; + font-weight: bold; +} + +h1 code, h2 code, h3 code, h4 code, h5 code, h6 code { + background-color: transparent; +} + +.viewcode-link { + float: right; +} + +.viewcode-back { + float: right; + font-family: sans-serif; +} + +div.viewcode-block:target { + margin: -1px -10px; + padding: 0 10px; +} + +/* -- math display ---------------------------------------------------------- */ + +img.math { + vertical-align: middle; +} + +div.body div.math p { + text-align: center; +} + +span.eqno { + float: right; +} + +span.eqno a.headerlink { + position: absolute; + z-index: 1; +} + +div.math:hover a.headerlink { + visibility: visible; +} + +/* -- printout stylesheet 
--------------------------------------------------- */ + +@media print { + div.document, + div.documentwrapper, + div.bodywrapper { + margin: 0 !important; + width: 100%; + } + + div.sphinxsidebar, + div.related, + div.footer, + #top-link { + display: none; + } +} \ No newline at end of file diff --git a/docs/0.7.1/html/_static/custom.css b/docs/0.7.1/html/_static/custom.css new file mode 100644 index 000000000..8a7c7cb54 --- /dev/null +++ b/docs/0.7.1/html/_static/custom.css @@ -0,0 +1,50 @@ +/* + the ipython3 code blocks coming from the notebooks + were not getting the dark theme styles applied, so + manually overriding them +*/ +@media (prefers-color-scheme: dark) { + .highlight-ipython3 { + border: none !important; + border-radius: 2px !important; + background: #202020 !important; + color: #d0d0d0 !important; + } +} + +@media (prefers-color-scheme: dark) { + tr:nth-child(odd) { + background-color: #202020 !important; + } +} + +@media (prefers-color-scheme: dark) { + .dataframe { + color: white !important; + } +} + +.hidden { + display: none; +} + +.version { + text-align: right; + font-size: 24px; + margin-top: -47px; + margin-right: 3px; +} + +.sidebar-brand { + margin-bottom: -10px; + margin-top: 10px; +} + +/* unknown warning was showing, manually hiding */ +#Visualizing-Logged-Dataframes .admonition.warning { + display: none; +} + +div.output_area.stderr { + display: none; +} diff --git a/docs/0.7.1/html/_static/doctools.js b/docs/0.7.1/html/_static/doctools.js new file mode 100644 index 000000000..8cbf1b161 --- /dev/null +++ b/docs/0.7.1/html/_static/doctools.js @@ -0,0 +1,323 @@ +/* + * doctools.js + * ~~~~~~~~~~~ + * + * Sphinx JavaScript utilities for all documentation. + * + * :copyright: Copyright 2007-2021 by the Sphinx team, see AUTHORS. + * :license: BSD, see LICENSE for details. + * + */ + +/** + * select a different prefix for underscore + */ +$u = _.noConflict(); + +/** + * make the code below compatible with browsers without + * an installed firebug like debugger +if (!window.console || !console.firebug) { + var names = ["log", "debug", "info", "warn", "error", "assert", "dir", + "dirxml", "group", "groupEnd", "time", "timeEnd", "count", "trace", + "profile", "profileEnd"]; + window.console = {}; + for (var i = 0; i < names.length; ++i) + window.console[names[i]] = function() {}; +} + */ + +/** + * small helper function to urldecode strings + * + * See https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/decodeURIComponent#Decoding_query_parameters_from_a_URL + */ +jQuery.urldecode = function(x) { + if (!x) { + return x + } + return decodeURIComponent(x.replace(/\+/g, ' ')); +}; + +/** + * small helper function to urlencode strings + */ +jQuery.urlencode = encodeURIComponent; + +/** + * This function returns the parsed url parameters of the + * current request. Multiple values per key are supported, + * it will always return arrays of strings for the value parts. + */ +jQuery.getQueryParameters = function(s) { + if (typeof s === 'undefined') + s = document.location.search; + var parts = s.substr(s.indexOf('?') + 1).split('&'); + var result = {}; + for (var i = 0; i < parts.length; i++) { + var tmp = parts[i].split('=', 2); + var key = jQuery.urldecode(tmp[0]); + var value = jQuery.urldecode(tmp[1]); + if (key in result) + result[key].push(value); + else + result[key] = [value]; + } + return result; +}; + +/** + * highlight a given string on a jquery object by wrapping it in + * span elements with the given class name. 
+ */ +jQuery.fn.highlightText = function(text, className) { + function highlight(node, addItems) { + if (node.nodeType === 3) { + var val = node.nodeValue; + var pos = val.toLowerCase().indexOf(text); + if (pos >= 0 && + !jQuery(node.parentNode).hasClass(className) && + !jQuery(node.parentNode).hasClass("nohighlight")) { + var span; + var isInSVG = jQuery(node).closest("body, svg, foreignObject").is("svg"); + if (isInSVG) { + span = document.createElementNS("http://www.w3.org/2000/svg", "tspan"); + } else { + span = document.createElement("span"); + span.className = className; + } + span.appendChild(document.createTextNode(val.substr(pos, text.length))); + node.parentNode.insertBefore(span, node.parentNode.insertBefore( + document.createTextNode(val.substr(pos + text.length)), + node.nextSibling)); + node.nodeValue = val.substr(0, pos); + if (isInSVG) { + var rect = document.createElementNS("http://www.w3.org/2000/svg", "rect"); + var bbox = node.parentElement.getBBox(); + rect.x.baseVal.value = bbox.x; + rect.y.baseVal.value = bbox.y; + rect.width.baseVal.value = bbox.width; + rect.height.baseVal.value = bbox.height; + rect.setAttribute('class', className); + addItems.push({ + "parent": node.parentNode, + "target": rect}); + } + } + } + else if (!jQuery(node).is("button, select, textarea")) { + jQuery.each(node.childNodes, function() { + highlight(this, addItems); + }); + } + } + var addItems = []; + var result = this.each(function() { + highlight(this, addItems); + }); + for (var i = 0; i < addItems.length; ++i) { + jQuery(addItems[i].parent).before(addItems[i].target); + } + return result; +}; + +/* + * backward compatibility for jQuery.browser + * This will be supported until firefox bug is fixed. + */ +if (!jQuery.browser) { + jQuery.uaMatch = function(ua) { + ua = ua.toLowerCase(); + + var match = /(chrome)[ \/]([\w.]+)/.exec(ua) || + /(webkit)[ \/]([\w.]+)/.exec(ua) || + /(opera)(?:.*version|)[ \/]([\w.]+)/.exec(ua) || + /(msie) ([\w.]+)/.exec(ua) || + ua.indexOf("compatible") < 0 && /(mozilla)(?:.*? rv:([\w.]+)|)/.exec(ua) || + []; + + return { + browser: match[ 1 ] || "", + version: match[ 2 ] || "0" + }; + }; + jQuery.browser = {}; + jQuery.browser[jQuery.uaMatch(navigator.userAgent).browser] = true; +} + +/** + * Small JavaScript module for the documentation. + */ +var Documentation = { + + init : function() { + this.fixFirefoxAnchorBug(); + this.highlightSearchWords(); + this.initIndexTable(); + if (DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS) { + this.initOnKeyListeners(); + } + }, + + /** + * i18n support + */ + TRANSLATIONS : {}, + PLURAL_EXPR : function(n) { return n === 1 ? 0 : 1; }, + LOCALE : 'unknown', + + // gettext and ngettext don't access this so that the functions + // can safely bound to a different name (_ = Documentation.gettext) + gettext : function(string) { + var translated = Documentation.TRANSLATIONS[string]; + if (typeof translated === 'undefined') + return string; + return (typeof translated === 'string') ? translated : translated[0]; + }, + + ngettext : function(singular, plural, n) { + var translated = Documentation.TRANSLATIONS[singular]; + if (typeof translated === 'undefined') + return (n == 1) ? 
singular : plural; + return translated[Documentation.PLURALEXPR(n)]; + }, + + addTranslations : function(catalog) { + for (var key in catalog.messages) + this.TRANSLATIONS[key] = catalog.messages[key]; + this.PLURAL_EXPR = new Function('n', 'return +(' + catalog.plural_expr + ')'); + this.LOCALE = catalog.locale; + }, + + /** + * add context elements like header anchor links + */ + addContextElements : function() { + $('div[id] > :header:first').each(function() { + $('\u00B6'). + attr('href', '#' + this.id). + attr('title', _('Permalink to this headline')). + appendTo(this); + }); + $('dt[id]').each(function() { + $('\u00B6'). + attr('href', '#' + this.id). + attr('title', _('Permalink to this definition')). + appendTo(this); + }); + }, + + /** + * workaround a firefox stupidity + * see: https://bugzilla.mozilla.org/show_bug.cgi?id=645075 + */ + fixFirefoxAnchorBug : function() { + if (document.location.hash && $.browser.mozilla) + window.setTimeout(function() { + document.location.href += ''; + }, 10); + }, + + /** + * highlight the search words provided in the url in the text + */ + highlightSearchWords : function() { + var params = $.getQueryParameters(); + var terms = (params.highlight) ? params.highlight[0].split(/\s+/) : []; + if (terms.length) { + var body = $('div.body'); + if (!body.length) { + body = $('body'); + } + window.setTimeout(function() { + $.each(terms, function() { + body.highlightText(this.toLowerCase(), 'highlighted'); + }); + }, 10); + $('') + .appendTo($('#searchbox')); + } + }, + + /** + * init the domain index toggle buttons + */ + initIndexTable : function() { + var togglers = $('img.toggler').click(function() { + var src = $(this).attr('src'); + var idnum = $(this).attr('id').substr(7); + $('tr.cg-' + idnum).toggle(); + if (src.substr(-9) === 'minus.png') + $(this).attr('src', src.substr(0, src.length-9) + 'plus.png'); + else + $(this).attr('src', src.substr(0, src.length-8) + 'minus.png'); + }).css('display', ''); + if (DOCUMENTATION_OPTIONS.COLLAPSE_INDEX) { + togglers.click(); + } + }, + + /** + * helper function to hide the search marks again + */ + hideSearchWords : function() { + $('#searchbox .highlight-link').fadeOut(300); + $('span.highlighted').removeClass('highlighted'); + }, + + /** + * make the url absolute + */ + makeURL : function(relativeURL) { + return DOCUMENTATION_OPTIONS.URL_ROOT + '/' + relativeURL; + }, + + /** + * get the current relative url + */ + getCurrentURL : function() { + var path = document.location.pathname; + var parts = path.split(/\//); + $.each(DOCUMENTATION_OPTIONS.URL_ROOT.split(/\//), function() { + if (this === '..') + parts.pop(); + }); + var url = parts.join('/'); + return path.substring(url.lastIndexOf('/') + 1, path.length - 1); + }, + + initOnKeyListeners: function() { + $(document).keydown(function(event) { + var activeElementType = document.activeElement.tagName; + // don't navigate when in search box, textarea, dropdown or button + if (activeElementType !== 'TEXTAREA' && activeElementType !== 'INPUT' && activeElementType !== 'SELECT' + && activeElementType !== 'BUTTON' && !event.altKey && !event.ctrlKey && !event.metaKey + && !event.shiftKey) { + switch (event.keyCode) { + case 37: // left + var prevHref = $('link[rel="prev"]').prop('href'); + if (prevHref) { + window.location.href = prevHref; + return false; + } + break; + case 39: // right + var nextHref = $('link[rel="next"]').prop('href'); + if (nextHref) { + window.location.href = nextHref; + return false; + } + break; + } + } + }); + } +}; + +// quick alias 
for translations +_ = Documentation.gettext; + +$(document).ready(function() { + Documentation.init(); +}); diff --git a/docs/0.7.1/html/_static/documentation_options.js b/docs/0.7.1/html/_static/documentation_options.js new file mode 100644 index 000000000..2fa8c97fe --- /dev/null +++ b/docs/0.7.1/html/_static/documentation_options.js @@ -0,0 +1,12 @@ +var DOCUMENTATION_OPTIONS = { + URL_ROOT: document.getElementById("documentation_options").getAttribute('data-url_root'), + VERSION: '', + LANGUAGE: 'None', + COLLAPSE_INDEX: false, + BUILDER: 'html', + FILE_SUFFIX: '.html', + LINK_SUFFIX: '.html', + HAS_SOURCE: true, + SOURCELINK_SUFFIX: '.txt', + NAVIGATION_WITH_KEYS: false +}; \ No newline at end of file diff --git a/docs/0.7.1/html/_static/file.png b/docs/0.7.1/html/_static/file.png new file mode 100644 index 000000000..a858a410e Binary files /dev/null and b/docs/0.7.1/html/_static/file.png differ diff --git a/docs/0.7.1/html/_static/images/DataProfilerDarkLogoLong.png b/docs/0.7.1/html/_static/images/DataProfilerDarkLogoLong.png new file mode 100644 index 000000000..a339e0f6a Binary files /dev/null and b/docs/0.7.1/html/_static/images/DataProfilerDarkLogoLong.png differ diff --git a/docs/0.7.1/html/_static/images/DataProfilerLogoLightTheme.png b/docs/0.7.1/html/_static/images/DataProfilerLogoLightTheme.png new file mode 100644 index 000000000..35e59c349 Binary files /dev/null and b/docs/0.7.1/html/_static/images/DataProfilerLogoLightTheme.png differ diff --git a/docs/0.7.1/html/_static/images/DataProfilerLogoLightThemeLong.png b/docs/0.7.1/html/_static/images/DataProfilerLogoLightThemeLong.png new file mode 100644 index 000000000..ca86fe167 Binary files /dev/null and b/docs/0.7.1/html/_static/images/DataProfilerLogoLightThemeLong.png differ diff --git a/docs/0.7.1/html/_static/images/histogram_example_0.png b/docs/0.7.1/html/_static/images/histogram_example_0.png new file mode 100644 index 000000000..9b8301363 Binary files /dev/null and b/docs/0.7.1/html/_static/images/histogram_example_0.png differ diff --git a/docs/0.7.1/html/_static/images/histogram_example_1.png b/docs/0.7.1/html/_static/images/histogram_example_1.png new file mode 100644 index 000000000..062dfdbb9 Binary files /dev/null and b/docs/0.7.1/html/_static/images/histogram_example_1.png differ diff --git a/docs/0.7.1/html/_static/images/histogram_example_2.png b/docs/0.7.1/html/_static/images/histogram_example_2.png new file mode 100644 index 000000000..1aedf7549 Binary files /dev/null and b/docs/0.7.1/html/_static/images/histogram_example_2.png differ diff --git a/docs/0.7.1/html/_static/jquery-3.5.1.js b/docs/0.7.1/html/_static/jquery-3.5.1.js new file mode 100644 index 000000000..50937333b --- /dev/null +++ b/docs/0.7.1/html/_static/jquery-3.5.1.js @@ -0,0 +1,10872 @@ +/*! + * jQuery JavaScript Library v3.5.1 + * https://jquery.com/ + * + * Includes Sizzle.js + * https://sizzlejs.com/ + * + * Copyright JS Foundation and other contributors + * Released under the MIT license + * https://jquery.org/license + * + * Date: 2020-05-04T22:49Z + */ +( function( global, factory ) { + + "use strict"; + + if ( typeof module === "object" && typeof module.exports === "object" ) { + + // For CommonJS and CommonJS-like environments where a proper `window` + // is present, execute the factory and get jQuery. + // For environments that do not have a `window` with a `document` + // (such as Node.js), expose a factory as module.exports. + // This accentuates the need for the creation of a real `window`. + // e.g. 
var jQuery = require("jquery")(window); + // See ticket #14549 for more info. + module.exports = global.document ? + factory( global, true ) : + function( w ) { + if ( !w.document ) { + throw new Error( "jQuery requires a window with a document" ); + } + return factory( w ); + }; + } else { + factory( global ); + } + +// Pass this if window is not defined yet +} )( typeof window !== "undefined" ? window : this, function( window, noGlobal ) { + +// Edge <= 12 - 13+, Firefox <=18 - 45+, IE 10 - 11, Safari 5.1 - 9+, iOS 6 - 9.1 +// throw exceptions when non-strict code (e.g., ASP.NET 4.5) accesses strict mode +// arguments.callee.caller (trac-13335). But as of jQuery 3.0 (2016), strict mode should be common +// enough that all such attempts are guarded in a try block. +"use strict"; + +var arr = []; + +var getProto = Object.getPrototypeOf; + +var slice = arr.slice; + +var flat = arr.flat ? function( array ) { + return arr.flat.call( array ); +} : function( array ) { + return arr.concat.apply( [], array ); +}; + + +var push = arr.push; + +var indexOf = arr.indexOf; + +var class2type = {}; + +var toString = class2type.toString; + +var hasOwn = class2type.hasOwnProperty; + +var fnToString = hasOwn.toString; + +var ObjectFunctionString = fnToString.call( Object ); + +var support = {}; + +var isFunction = function isFunction( obj ) { + + // Support: Chrome <=57, Firefox <=52 + // In some browsers, typeof returns "function" for HTML elements + // (i.e., `typeof document.createElement( "object" ) === "function"`). + // We don't want to classify *any* DOM node as a function. + return typeof obj === "function" && typeof obj.nodeType !== "number"; + }; + + +var isWindow = function isWindow( obj ) { + return obj != null && obj === obj.window; + }; + + +var document = window.document; + + + + var preservedScriptAttributes = { + type: true, + src: true, + nonce: true, + noModule: true + }; + + function DOMEval( code, node, doc ) { + doc = doc || document; + + var i, val, + script = doc.createElement( "script" ); + + script.text = code; + if ( node ) { + for ( i in preservedScriptAttributes ) { + + // Support: Firefox 64+, Edge 18+ + // Some browsers don't support the "nonce" property on scripts. + // On the other hand, just using `getAttribute` is not enough as + // the `nonce` attribute is reset to an empty string whenever it + // becomes browsing-context connected. + // See https://github.com/whatwg/html/issues/2369 + // See https://html.spec.whatwg.org/#nonce-attributes + // The `node.getAttribute` check was added for the sake of + // `jQuery.globalEval` so that it can fake a nonce-containing node + // via an object. + val = node[ i ] || node.getAttribute && node.getAttribute( i ); + if ( val ) { + script.setAttribute( i, val ); + } + } + } + doc.head.appendChild( script ).parentNode.removeChild( script ); + } + + +function toType( obj ) { + if ( obj == null ) { + return obj + ""; + } + + // Support: Android <=2.3 only (functionish RegExp) + return typeof obj === "object" || typeof obj === "function" ? 
+ class2type[ toString.call( obj ) ] || "object" : + typeof obj; +} +/* global Symbol */ +// Defining this global in .eslintrc.json would create a danger of using the global +// unguarded in another place, it seems safer to define global only for this module + + + +var + version = "3.5.1", + + // Define a local copy of jQuery + jQuery = function( selector, context ) { + + // The jQuery object is actually just the init constructor 'enhanced' + // Need init if jQuery is called (just allow error to be thrown if not included) + return new jQuery.fn.init( selector, context ); + }; + +jQuery.fn = jQuery.prototype = { + + // The current version of jQuery being used + jquery: version, + + constructor: jQuery, + + // The default length of a jQuery object is 0 + length: 0, + + toArray: function() { + return slice.call( this ); + }, + + // Get the Nth element in the matched element set OR + // Get the whole matched element set as a clean array + get: function( num ) { + + // Return all the elements in a clean array + if ( num == null ) { + return slice.call( this ); + } + + // Return just the one element from the set + return num < 0 ? this[ num + this.length ] : this[ num ]; + }, + + // Take an array of elements and push it onto the stack + // (returning the new matched element set) + pushStack: function( elems ) { + + // Build a new jQuery matched element set + var ret = jQuery.merge( this.constructor(), elems ); + + // Add the old object onto the stack (as a reference) + ret.prevObject = this; + + // Return the newly-formed element set + return ret; + }, + + // Execute a callback for every element in the matched set. + each: function( callback ) { + return jQuery.each( this, callback ); + }, + + map: function( callback ) { + return this.pushStack( jQuery.map( this, function( elem, i ) { + return callback.call( elem, i, elem ); + } ) ); + }, + + slice: function() { + return this.pushStack( slice.apply( this, arguments ) ); + }, + + first: function() { + return this.eq( 0 ); + }, + + last: function() { + return this.eq( -1 ); + }, + + even: function() { + return this.pushStack( jQuery.grep( this, function( _elem, i ) { + return ( i + 1 ) % 2; + } ) ); + }, + + odd: function() { + return this.pushStack( jQuery.grep( this, function( _elem, i ) { + return i % 2; + } ) ); + }, + + eq: function( i ) { + var len = this.length, + j = +i + ( i < 0 ? len : 0 ); + return this.pushStack( j >= 0 && j < len ? [ this[ j ] ] : [] ); + }, + + end: function() { + return this.prevObject || this.constructor(); + }, + + // For internal use only. + // Behaves like an Array's method, not like a jQuery method. 
+ push: push, + sort: arr.sort, + splice: arr.splice +}; + +jQuery.extend = jQuery.fn.extend = function() { + var options, name, src, copy, copyIsArray, clone, + target = arguments[ 0 ] || {}, + i = 1, + length = arguments.length, + deep = false; + + // Handle a deep copy situation + if ( typeof target === "boolean" ) { + deep = target; + + // Skip the boolean and the target + target = arguments[ i ] || {}; + i++; + } + + // Handle case when target is a string or something (possible in deep copy) + if ( typeof target !== "object" && !isFunction( target ) ) { + target = {}; + } + + // Extend jQuery itself if only one argument is passed + if ( i === length ) { + target = this; + i--; + } + + for ( ; i < length; i++ ) { + + // Only deal with non-null/undefined values + if ( ( options = arguments[ i ] ) != null ) { + + // Extend the base object + for ( name in options ) { + copy = options[ name ]; + + // Prevent Object.prototype pollution + // Prevent never-ending loop + if ( name === "__proto__" || target === copy ) { + continue; + } + + // Recurse if we're merging plain objects or arrays + if ( deep && copy && ( jQuery.isPlainObject( copy ) || + ( copyIsArray = Array.isArray( copy ) ) ) ) { + src = target[ name ]; + + // Ensure proper type for the source value + if ( copyIsArray && !Array.isArray( src ) ) { + clone = []; + } else if ( !copyIsArray && !jQuery.isPlainObject( src ) ) { + clone = {}; + } else { + clone = src; + } + copyIsArray = false; + + // Never move original objects, clone them + target[ name ] = jQuery.extend( deep, clone, copy ); + + // Don't bring in undefined values + } else if ( copy !== undefined ) { + target[ name ] = copy; + } + } + } + } + + // Return the modified object + return target; +}; + +jQuery.extend( { + + // Unique for each copy of jQuery on the page + expando: "jQuery" + ( version + Math.random() ).replace( /\D/g, "" ), + + // Assume jQuery is ready without the ready module + isReady: true, + + error: function( msg ) { + throw new Error( msg ); + }, + + noop: function() {}, + + isPlainObject: function( obj ) { + var proto, Ctor; + + // Detect obvious negatives + // Use toString instead of jQuery.type to catch host objects + if ( !obj || toString.call( obj ) !== "[object Object]" ) { + return false; + } + + proto = getProto( obj ); + + // Objects with no prototype (e.g., `Object.create( null )`) are plain + if ( !proto ) { + return true; + } + + // Objects with prototype are plain iff they were constructed by a global Object function + Ctor = hasOwn.call( proto, "constructor" ) && proto.constructor; + return typeof Ctor === "function" && fnToString.call( Ctor ) === ObjectFunctionString; + }, + + isEmptyObject: function( obj ) { + var name; + + for ( name in obj ) { + return false; + } + return true; + }, + + // Evaluates a script in a provided context; falls back to the global one + // if not specified. 
+ globalEval: function( code, options, doc ) { + DOMEval( code, { nonce: options && options.nonce }, doc ); + }, + + each: function( obj, callback ) { + var length, i = 0; + + if ( isArrayLike( obj ) ) { + length = obj.length; + for ( ; i < length; i++ ) { + if ( callback.call( obj[ i ], i, obj[ i ] ) === false ) { + break; + } + } + } else { + for ( i in obj ) { + if ( callback.call( obj[ i ], i, obj[ i ] ) === false ) { + break; + } + } + } + + return obj; + }, + + // results is for internal usage only + makeArray: function( arr, results ) { + var ret = results || []; + + if ( arr != null ) { + if ( isArrayLike( Object( arr ) ) ) { + jQuery.merge( ret, + typeof arr === "string" ? + [ arr ] : arr + ); + } else { + push.call( ret, arr ); + } + } + + return ret; + }, + + inArray: function( elem, arr, i ) { + return arr == null ? -1 : indexOf.call( arr, elem, i ); + }, + + // Support: Android <=4.0 only, PhantomJS 1 only + // push.apply(_, arraylike) throws on ancient WebKit + merge: function( first, second ) { + var len = +second.length, + j = 0, + i = first.length; + + for ( ; j < len; j++ ) { + first[ i++ ] = second[ j ]; + } + + first.length = i; + + return first; + }, + + grep: function( elems, callback, invert ) { + var callbackInverse, + matches = [], + i = 0, + length = elems.length, + callbackExpect = !invert; + + // Go through the array, only saving the items + // that pass the validator function + for ( ; i < length; i++ ) { + callbackInverse = !callback( elems[ i ], i ); + if ( callbackInverse !== callbackExpect ) { + matches.push( elems[ i ] ); + } + } + + return matches; + }, + + // arg is for internal usage only + map: function( elems, callback, arg ) { + var length, value, + i = 0, + ret = []; + + // Go through the array, translating each of the items to their new values + if ( isArrayLike( elems ) ) { + length = elems.length; + for ( ; i < length; i++ ) { + value = callback( elems[ i ], i, arg ); + + if ( value != null ) { + ret.push( value ); + } + } + + // Go through every key on the object, + } else { + for ( i in elems ) { + value = callback( elems[ i ], i, arg ); + + if ( value != null ) { + ret.push( value ); + } + } + } + + // Flatten any nested arrays + return flat( ret ); + }, + + // A global GUID counter for objects + guid: 1, + + // jQuery.support is not used in Core but other projects attach their + // properties to it so it needs to exist. + support: support +} ); + +if ( typeof Symbol === "function" ) { + jQuery.fn[ Symbol.iterator ] = arr[ Symbol.iterator ]; +} + +// Populate the class2type map +jQuery.each( "Boolean Number String Function Array Date RegExp Object Error Symbol".split( " " ), +function( _i, name ) { + class2type[ "[object " + name + "]" ] = name.toLowerCase(); +} ); + +function isArrayLike( obj ) { + + // Support: real iOS 8.2 only (not reproducible in simulator) + // `in` check used to prevent JIT error (gh-2145) + // hasOwn isn't used here due to false negatives + // regarding Nodelist length in IE + var length = !!obj && "length" in obj && obj.length, + type = toType( obj ); + + if ( isFunction( obj ) || isWindow( obj ) ) { + return false; + } + + return type === "array" || length === 0 || + typeof length === "number" && length > 0 && ( length - 1 ) in obj; +} +var Sizzle = +/*! 
+ * Sizzle CSS Selector Engine v2.3.5 + * https://sizzlejs.com/ + * + * Copyright JS Foundation and other contributors + * Released under the MIT license + * https://js.foundation/ + * + * Date: 2020-03-14 + */ +( function( window ) { +var i, + support, + Expr, + getText, + isXML, + tokenize, + compile, + select, + outermostContext, + sortInput, + hasDuplicate, + + // Local document vars + setDocument, + document, + docElem, + documentIsHTML, + rbuggyQSA, + rbuggyMatches, + matches, + contains, + + // Instance-specific data + expando = "sizzle" + 1 * new Date(), + preferredDoc = window.document, + dirruns = 0, + done = 0, + classCache = createCache(), + tokenCache = createCache(), + compilerCache = createCache(), + nonnativeSelectorCache = createCache(), + sortOrder = function( a, b ) { + if ( a === b ) { + hasDuplicate = true; + } + return 0; + }, + + // Instance methods + hasOwn = ( {} ).hasOwnProperty, + arr = [], + pop = arr.pop, + pushNative = arr.push, + push = arr.push, + slice = arr.slice, + + // Use a stripped-down indexOf as it's faster than native + // https://jsperf.com/thor-indexof-vs-for/5 + indexOf = function( list, elem ) { + var i = 0, + len = list.length; + for ( ; i < len; i++ ) { + if ( list[ i ] === elem ) { + return i; + } + } + return -1; + }, + + booleans = "checked|selected|async|autofocus|autoplay|controls|defer|disabled|hidden|" + + "ismap|loop|multiple|open|readonly|required|scoped", + + // Regular expressions + + // http://www.w3.org/TR/css3-selectors/#whitespace + whitespace = "[\\x20\\t\\r\\n\\f]", + + // https://www.w3.org/TR/css-syntax-3/#ident-token-diagram + identifier = "(?:\\\\[\\da-fA-F]{1,6}" + whitespace + + "?|\\\\[^\\r\\n\\f]|[\\w-]|[^\0-\\x7f])+", + + // Attribute selectors: http://www.w3.org/TR/selectors/#attribute-selectors + attributes = "\\[" + whitespace + "*(" + identifier + ")(?:" + whitespace + + + // Operator (capture 2) + "*([*^$|!~]?=)" + whitespace + + + // "Attribute values must be CSS identifiers [capture 5] + // or strings [capture 3 or capture 4]" + "*(?:'((?:\\\\.|[^\\\\'])*)'|\"((?:\\\\.|[^\\\\\"])*)\"|(" + identifier + "))|)" + + whitespace + "*\\]", + + pseudos = ":(" + identifier + ")(?:\\((" + + + // To reduce the number of selectors needing tokenize in the preFilter, prefer arguments: + // 1. quoted (capture 3; capture 4 or capture 5) + "('((?:\\\\.|[^\\\\'])*)'|\"((?:\\\\.|[^\\\\\"])*)\")|" + + + // 2. simple (capture 6) + "((?:\\\\.|[^\\\\()[\\]]|" + attributes + ")*)|" + + + // 3. 
anything else (capture 2) + ".*" + + ")\\)|)", + + // Leading and non-escaped trailing whitespace, capturing some non-whitespace characters preceding the latter + rwhitespace = new RegExp( whitespace + "+", "g" ), + rtrim = new RegExp( "^" + whitespace + "+|((?:^|[^\\\\])(?:\\\\.)*)" + + whitespace + "+$", "g" ), + + rcomma = new RegExp( "^" + whitespace + "*," + whitespace + "*" ), + rcombinators = new RegExp( "^" + whitespace + "*([>+~]|" + whitespace + ")" + whitespace + + "*" ), + rdescend = new RegExp( whitespace + "|>" ), + + rpseudo = new RegExp( pseudos ), + ridentifier = new RegExp( "^" + identifier + "$" ), + + matchExpr = { + "ID": new RegExp( "^#(" + identifier + ")" ), + "CLASS": new RegExp( "^\\.(" + identifier + ")" ), + "TAG": new RegExp( "^(" + identifier + "|[*])" ), + "ATTR": new RegExp( "^" + attributes ), + "PSEUDO": new RegExp( "^" + pseudos ), + "CHILD": new RegExp( "^:(only|first|last|nth|nth-last)-(child|of-type)(?:\\(" + + whitespace + "*(even|odd|(([+-]|)(\\d*)n|)" + whitespace + "*(?:([+-]|)" + + whitespace + "*(\\d+)|))" + whitespace + "*\\)|)", "i" ), + "bool": new RegExp( "^(?:" + booleans + ")$", "i" ), + + // For use in libraries implementing .is() + // We use this for POS matching in `select` + "needsContext": new RegExp( "^" + whitespace + + "*[>+~]|:(even|odd|eq|gt|lt|nth|first|last)(?:\\(" + whitespace + + "*((?:-\\d)?\\d*)" + whitespace + "*\\)|)(?=[^-]|$)", "i" ) + }, + + rhtml = /HTML$/i, + rinputs = /^(?:input|select|textarea|button)$/i, + rheader = /^h\d$/i, + + rnative = /^[^{]+\{\s*\[native \w/, + + // Easily-parseable/retrievable ID or TAG or CLASS selectors + rquickExpr = /^(?:#([\w-]+)|(\w+)|\.([\w-]+))$/, + + rsibling = /[+~]/, + + // CSS escapes + // http://www.w3.org/TR/CSS21/syndata.html#escaped-characters + runescape = new RegExp( "\\\\[\\da-fA-F]{1,6}" + whitespace + "?|\\\\([^\\r\\n\\f])", "g" ), + funescape = function( escape, nonHex ) { + var high = "0x" + escape.slice( 1 ) - 0x10000; + + return nonHex ? + + // Strip the backslash prefix from a non-hex escape sequence + nonHex : + + // Replace a hexadecimal escape sequence with the encoded Unicode code point + // Support: IE <=11+ + // For values outside the Basic Multilingual Plane (BMP), manually construct a + // surrogate pair + high < 0 ? 
+ String.fromCharCode( high + 0x10000 ) : + String.fromCharCode( high >> 10 | 0xD800, high & 0x3FF | 0xDC00 ); + }, + + // CSS string/identifier serialization + // https://drafts.csswg.org/cssom/#common-serializing-idioms + rcssescape = /([\0-\x1f\x7f]|^-?\d)|^-$|[^\0-\x1f\x7f-\uFFFF\w-]/g, + fcssescape = function( ch, asCodePoint ) { + if ( asCodePoint ) { + + // U+0000 NULL becomes U+FFFD REPLACEMENT CHARACTER + if ( ch === "\0" ) { + return "\uFFFD"; + } + + // Control characters and (dependent upon position) numbers get escaped as code points + return ch.slice( 0, -1 ) + "\\" + + ch.charCodeAt( ch.length - 1 ).toString( 16 ) + " "; + } + + // Other potentially-special ASCII characters get backslash-escaped + return "\\" + ch; + }, + + // Used for iframes + // See setDocument() + // Removing the function wrapper causes a "Permission Denied" + // error in IE + unloadHandler = function() { + setDocument(); + }, + + inDisabledFieldset = addCombinator( + function( elem ) { + return elem.disabled === true && elem.nodeName.toLowerCase() === "fieldset"; + }, + { dir: "parentNode", next: "legend" } + ); + +// Optimize for push.apply( _, NodeList ) +try { + push.apply( + ( arr = slice.call( preferredDoc.childNodes ) ), + preferredDoc.childNodes + ); + + // Support: Android<4.0 + // Detect silently failing push.apply + // eslint-disable-next-line no-unused-expressions + arr[ preferredDoc.childNodes.length ].nodeType; +} catch ( e ) { + push = { apply: arr.length ? + + // Leverage slice if possible + function( target, els ) { + pushNative.apply( target, slice.call( els ) ); + } : + + // Support: IE<9 + // Otherwise append directly + function( target, els ) { + var j = target.length, + i = 0; + + // Can't trust NodeList.length + while ( ( target[ j++ ] = els[ i++ ] ) ) {} + target.length = j - 1; + } + }; +} + +function Sizzle( selector, context, results, seed ) { + var m, i, elem, nid, match, groups, newSelector, + newContext = context && context.ownerDocument, + + // nodeType defaults to 9, since context defaults to document + nodeType = context ? 
context.nodeType : 9; + + results = results || []; + + // Return early from calls with invalid selector or context + if ( typeof selector !== "string" || !selector || + nodeType !== 1 && nodeType !== 9 && nodeType !== 11 ) { + + return results; + } + + // Try to shortcut find operations (as opposed to filters) in HTML documents + if ( !seed ) { + setDocument( context ); + context = context || document; + + if ( documentIsHTML ) { + + // If the selector is sufficiently simple, try using a "get*By*" DOM method + // (excepting DocumentFragment context, where the methods don't exist) + if ( nodeType !== 11 && ( match = rquickExpr.exec( selector ) ) ) { + + // ID selector + if ( ( m = match[ 1 ] ) ) { + + // Document context + if ( nodeType === 9 ) { + if ( ( elem = context.getElementById( m ) ) ) { + + // Support: IE, Opera, Webkit + // TODO: identify versions + // getElementById can match elements by name instead of ID + if ( elem.id === m ) { + results.push( elem ); + return results; + } + } else { + return results; + } + + // Element context + } else { + + // Support: IE, Opera, Webkit + // TODO: identify versions + // getElementById can match elements by name instead of ID + if ( newContext && ( elem = newContext.getElementById( m ) ) && + contains( context, elem ) && + elem.id === m ) { + + results.push( elem ); + return results; + } + } + + // Type selector + } else if ( match[ 2 ] ) { + push.apply( results, context.getElementsByTagName( selector ) ); + return results; + + // Class selector + } else if ( ( m = match[ 3 ] ) && support.getElementsByClassName && + context.getElementsByClassName ) { + + push.apply( results, context.getElementsByClassName( m ) ); + return results; + } + } + + // Take advantage of querySelectorAll + if ( support.qsa && + !nonnativeSelectorCache[ selector + " " ] && + ( !rbuggyQSA || !rbuggyQSA.test( selector ) ) && + + // Support: IE 8 only + // Exclude object elements + ( nodeType !== 1 || context.nodeName.toLowerCase() !== "object" ) ) { + + newSelector = selector; + newContext = context; + + // qSA considers elements outside a scoping root when evaluating child or + // descendant combinators, which is not what we want. + // In such cases, we work around the behavior by prefixing every selector in the + // list with an ID selector referencing the scope context. + // The technique has to be used as well when a leading combinator is used + // as such selectors are not recognized by querySelectorAll. + // Thanks to Andrew Dupont for this technique. + if ( nodeType === 1 && + ( rdescend.test( selector ) || rcombinators.test( selector ) ) ) { + + // Expand context for sibling selectors + newContext = rsibling.test( selector ) && testContext( context.parentNode ) || + context; + + // We can use :scope instead of the ID hack if the browser + // supports it & if we're not changing the context. + if ( newContext !== context || !support.scope ) { + + // Capture the context ID, setting it first if necessary + if ( ( nid = context.getAttribute( "id" ) ) ) { + nid = nid.replace( rcssescape, fcssescape ); + } else { + context.setAttribute( "id", ( nid = expando ) ); + } + } + + // Prefix every selector in the list + groups = tokenize( selector ); + i = groups.length; + while ( i-- ) { + groups[ i ] = ( nid ? 
"#" + nid : ":scope" ) + " " + + toSelector( groups[ i ] ); + } + newSelector = groups.join( "," ); + } + + try { + push.apply( results, + newContext.querySelectorAll( newSelector ) + ); + return results; + } catch ( qsaError ) { + nonnativeSelectorCache( selector, true ); + } finally { + if ( nid === expando ) { + context.removeAttribute( "id" ); + } + } + } + } + } + + // All others + return select( selector.replace( rtrim, "$1" ), context, results, seed ); +} + +/** + * Create key-value caches of limited size + * @returns {function(string, object)} Returns the Object data after storing it on itself with + * property name the (space-suffixed) string and (if the cache is larger than Expr.cacheLength) + * deleting the oldest entry + */ +function createCache() { + var keys = []; + + function cache( key, value ) { + + // Use (key + " ") to avoid collision with native prototype properties (see Issue #157) + if ( keys.push( key + " " ) > Expr.cacheLength ) { + + // Only keep the most recent entries + delete cache[ keys.shift() ]; + } + return ( cache[ key + " " ] = value ); + } + return cache; +} + +/** + * Mark a function for special use by Sizzle + * @param {Function} fn The function to mark + */ +function markFunction( fn ) { + fn[ expando ] = true; + return fn; +} + +/** + * Support testing using an element + * @param {Function} fn Passed the created element and returns a boolean result + */ +function assert( fn ) { + var el = document.createElement( "fieldset" ); + + try { + return !!fn( el ); + } catch ( e ) { + return false; + } finally { + + // Remove from its parent by default + if ( el.parentNode ) { + el.parentNode.removeChild( el ); + } + + // release memory in IE + el = null; + } +} + +/** + * Adds the same handler for all of the specified attrs + * @param {String} attrs Pipe-separated list of attributes + * @param {Function} handler The method that will be applied + */ +function addHandle( attrs, handler ) { + var arr = attrs.split( "|" ), + i = arr.length; + + while ( i-- ) { + Expr.attrHandle[ arr[ i ] ] = handler; + } +} + +/** + * Checks document order of two siblings + * @param {Element} a + * @param {Element} b + * @returns {Number} Returns less than 0 if a precedes b, greater than 0 if a follows b + */ +function siblingCheck( a, b ) { + var cur = b && a, + diff = cur && a.nodeType === 1 && b.nodeType === 1 && + a.sourceIndex - b.sourceIndex; + + // Use IE sourceIndex if available on both nodes + if ( diff ) { + return diff; + } + + // Check if b follows a + if ( cur ) { + while ( ( cur = cur.nextSibling ) ) { + if ( cur === b ) { + return -1; + } + } + } + + return a ? 
1 : -1; +} + +/** + * Returns a function to use in pseudos for input types + * @param {String} type + */ +function createInputPseudo( type ) { + return function( elem ) { + var name = elem.nodeName.toLowerCase(); + return name === "input" && elem.type === type; + }; +} + +/** + * Returns a function to use in pseudos for buttons + * @param {String} type + */ +function createButtonPseudo( type ) { + return function( elem ) { + var name = elem.nodeName.toLowerCase(); + return ( name === "input" || name === "button" ) && elem.type === type; + }; +} + +/** + * Returns a function to use in pseudos for :enabled/:disabled + * @param {Boolean} disabled true for :disabled; false for :enabled + */ +function createDisabledPseudo( disabled ) { + + // Known :disabled false positives: fieldset[disabled] > legend:nth-of-type(n+2) :can-disable + return function( elem ) { + + // Only certain elements can match :enabled or :disabled + // https://html.spec.whatwg.org/multipage/scripting.html#selector-enabled + // https://html.spec.whatwg.org/multipage/scripting.html#selector-disabled + if ( "form" in elem ) { + + // Check for inherited disabledness on relevant non-disabled elements: + // * listed form-associated elements in a disabled fieldset + // https://html.spec.whatwg.org/multipage/forms.html#category-listed + // https://html.spec.whatwg.org/multipage/forms.html#concept-fe-disabled + // * option elements in a disabled optgroup + // https://html.spec.whatwg.org/multipage/forms.html#concept-option-disabled + // All such elements have a "form" property. + if ( elem.parentNode && elem.disabled === false ) { + + // Option elements defer to a parent optgroup if present + if ( "label" in elem ) { + if ( "label" in elem.parentNode ) { + return elem.parentNode.disabled === disabled; + } else { + return elem.disabled === disabled; + } + } + + // Support: IE 6 - 11 + // Use the isDisabled shortcut property to check for disabled fieldset ancestors + return elem.isDisabled === disabled || + + // Where there is no isDisabled, check manually + /* jshint -W018 */ + elem.isDisabled !== !disabled && + inDisabledFieldset( elem ) === disabled; + } + + return elem.disabled === disabled; + + // Try to winnow out elements that can't be disabled before trusting the disabled property. + // Some victims get caught in our net (label, legend, menu, track), but it shouldn't + // even exist on them, let alone have a boolean value. 
+ } else if ( "label" in elem ) { + return elem.disabled === disabled; + } + + // Remaining elements are neither :enabled nor :disabled + return false; + }; +} + +/** + * Returns a function to use in pseudos for positionals + * @param {Function} fn + */ +function createPositionalPseudo( fn ) { + return markFunction( function( argument ) { + argument = +argument; + return markFunction( function( seed, matches ) { + var j, + matchIndexes = fn( [], seed.length, argument ), + i = matchIndexes.length; + + // Match elements found at the specified indexes + while ( i-- ) { + if ( seed[ ( j = matchIndexes[ i ] ) ] ) { + seed[ j ] = !( matches[ j ] = seed[ j ] ); + } + } + } ); + } ); +} + +/** + * Checks a node for validity as a Sizzle context + * @param {Element|Object=} context + * @returns {Element|Object|Boolean} The input node if acceptable, otherwise a falsy value + */ +function testContext( context ) { + return context && typeof context.getElementsByTagName !== "undefined" && context; +} + +// Expose support vars for convenience +support = Sizzle.support = {}; + +/** + * Detects XML nodes + * @param {Element|Object} elem An element or a document + * @returns {Boolean} True iff elem is a non-HTML XML node + */ +isXML = Sizzle.isXML = function( elem ) { + var namespace = elem.namespaceURI, + docElem = ( elem.ownerDocument || elem ).documentElement; + + // Support: IE <=8 + // Assume HTML when documentElement doesn't yet exist, such as inside loading iframes + // https://bugs.jquery.com/ticket/4833 + return !rhtml.test( namespace || docElem && docElem.nodeName || "HTML" ); +}; + +/** + * Sets document-related variables once based on the current document + * @param {Element|Object} [doc] An element or document object to use to set the document + * @returns {Object} Returns the current document + */ +setDocument = Sizzle.setDocument = function( node ) { + var hasCompare, subWindow, + doc = node ? node.ownerDocument || node : preferredDoc; + + // Return early if doc is invalid or already selected + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. + // eslint-disable-next-line eqeqeq + if ( doc == document || doc.nodeType !== 9 || !doc.documentElement ) { + return document; + } + + // Update global variables + document = doc; + docElem = document.documentElement; + documentIsHTML = !isXML( document ); + + // Support: IE 9 - 11+, Edge 12 - 18+ + // Accessing iframe documents after unload throws "permission denied" errors (jQuery #13936) + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. + // eslint-disable-next-line eqeqeq + if ( preferredDoc != document && + ( subWindow = document.defaultView ) && subWindow.top !== subWindow ) { + + // Support: IE 11, Edge + if ( subWindow.addEventListener ) { + subWindow.addEventListener( "unload", unloadHandler, false ); + + // Support: IE 9 - 10 only + } else if ( subWindow.attachEvent ) { + subWindow.attachEvent( "onunload", unloadHandler ); + } + } + + // Support: IE 8 - 11+, Edge 12 - 18+, Chrome <=16 - 25 only, Firefox <=3.6 - 31 only, + // Safari 4 - 5 only, Opera <=11.6 - 12.x only + // IE/Edge & older browsers don't support the :scope pseudo-class. + // Support: Safari 6.0 only + // Safari 6.0 supports :scope but it's an alias of :root there. 
+ support.scope = assert( function( el ) { + docElem.appendChild( el ).appendChild( document.createElement( "div" ) ); + return typeof el.querySelectorAll !== "undefined" && + !el.querySelectorAll( ":scope fieldset div" ).length; + } ); + + /* Attributes + ---------------------------------------------------------------------- */ + + // Support: IE<8 + // Verify that getAttribute really returns attributes and not properties + // (excepting IE8 booleans) + support.attributes = assert( function( el ) { + el.className = "i"; + return !el.getAttribute( "className" ); + } ); + + /* getElement(s)By* + ---------------------------------------------------------------------- */ + + // Check if getElementsByTagName("*") returns only elements + support.getElementsByTagName = assert( function( el ) { + el.appendChild( document.createComment( "" ) ); + return !el.getElementsByTagName( "*" ).length; + } ); + + // Support: IE<9 + support.getElementsByClassName = rnative.test( document.getElementsByClassName ); + + // Support: IE<10 + // Check if getElementById returns elements by name + // The broken getElementById methods don't pick up programmatically-set names, + // so use a roundabout getElementsByName test + support.getById = assert( function( el ) { + docElem.appendChild( el ).id = expando; + return !document.getElementsByName || !document.getElementsByName( expando ).length; + } ); + + // ID filter and find + if ( support.getById ) { + Expr.filter[ "ID" ] = function( id ) { + var attrId = id.replace( runescape, funescape ); + return function( elem ) { + return elem.getAttribute( "id" ) === attrId; + }; + }; + Expr.find[ "ID" ] = function( id, context ) { + if ( typeof context.getElementById !== "undefined" && documentIsHTML ) { + var elem = context.getElementById( id ); + return elem ? [ elem ] : []; + } + }; + } else { + Expr.filter[ "ID" ] = function( id ) { + var attrId = id.replace( runescape, funescape ); + return function( elem ) { + var node = typeof elem.getAttributeNode !== "undefined" && + elem.getAttributeNode( "id" ); + return node && node.value === attrId; + }; + }; + + // Support: IE 6 - 7 only + // getElementById is not reliable as a find shortcut + Expr.find[ "ID" ] = function( id, context ) { + if ( typeof context.getElementById !== "undefined" && documentIsHTML ) { + var node, i, elems, + elem = context.getElementById( id ); + + if ( elem ) { + + // Verify the id attribute + node = elem.getAttributeNode( "id" ); + if ( node && node.value === id ) { + return [ elem ]; + } + + // Fall back on getElementsByName + elems = context.getElementsByName( id ); + i = 0; + while ( ( elem = elems[ i++ ] ) ) { + node = elem.getAttributeNode( "id" ); + if ( node && node.value === id ) { + return [ elem ]; + } + } + } + + return []; + } + }; + } + + // Tag + Expr.find[ "TAG" ] = support.getElementsByTagName ? 
+ function( tag, context ) { + if ( typeof context.getElementsByTagName !== "undefined" ) { + return context.getElementsByTagName( tag ); + + // DocumentFragment nodes don't have gEBTN + } else if ( support.qsa ) { + return context.querySelectorAll( tag ); + } + } : + + function( tag, context ) { + var elem, + tmp = [], + i = 0, + + // By happy coincidence, a (broken) gEBTN appears on DocumentFragment nodes too + results = context.getElementsByTagName( tag ); + + // Filter out possible comments + if ( tag === "*" ) { + while ( ( elem = results[ i++ ] ) ) { + if ( elem.nodeType === 1 ) { + tmp.push( elem ); + } + } + + return tmp; + } + return results; + }; + + // Class + Expr.find[ "CLASS" ] = support.getElementsByClassName && function( className, context ) { + if ( typeof context.getElementsByClassName !== "undefined" && documentIsHTML ) { + return context.getElementsByClassName( className ); + } + }; + + /* QSA/matchesSelector + ---------------------------------------------------------------------- */ + + // QSA and matchesSelector support + + // matchesSelector(:active) reports false when true (IE9/Opera 11.5) + rbuggyMatches = []; + + // qSa(:focus) reports false when true (Chrome 21) + // We allow this because of a bug in IE8/9 that throws an error + // whenever `document.activeElement` is accessed on an iframe + // So, we allow :focus to pass through QSA all the time to avoid the IE error + // See https://bugs.jquery.com/ticket/13378 + rbuggyQSA = []; + + if ( ( support.qsa = rnative.test( document.querySelectorAll ) ) ) { + + // Build QSA regex + // Regex strategy adopted from Diego Perini + assert( function( el ) { + + var input; + + // Select is set to empty string on purpose + // This is to test IE's treatment of not explicitly + // setting a boolean content attribute, + // since its presence should be enough + // https://bugs.jquery.com/ticket/12359 + docElem.appendChild( el ).innerHTML = "" + + ""; + + // Support: IE8, Opera 11-12.16 + // Nothing should be selected when empty strings follow ^= or $= or *= + // The test attribute must be unknown in Opera but "safe" for WinRT + // https://msdn.microsoft.com/en-us/library/ie/hh465388.aspx#attribute_section + if ( el.querySelectorAll( "[msallowcapture^='']" ).length ) { + rbuggyQSA.push( "[*^$]=" + whitespace + "*(?:''|\"\")" ); + } + + // Support: IE8 + // Boolean attributes and "value" are not treated correctly + if ( !el.querySelectorAll( "[selected]" ).length ) { + rbuggyQSA.push( "\\[" + whitespace + "*(?:value|" + booleans + ")" ); + } + + // Support: Chrome<29, Android<4.4, Safari<7.0+, iOS<7.0+, PhantomJS<1.9.8+ + if ( !el.querySelectorAll( "[id~=" + expando + "-]" ).length ) { + rbuggyQSA.push( "~=" ); + } + + // Support: IE 11+, Edge 15 - 18+ + // IE 11/Edge don't find elements on a `[name='']` query in some cases. + // Adding a temporary attribute to the document before the selection works + // around the issue. + // Interestingly, IE 10 & older don't seem to have the issue. 
+ input = document.createElement( "input" ); + input.setAttribute( "name", "" ); + el.appendChild( input ); + if ( !el.querySelectorAll( "[name='']" ).length ) { + rbuggyQSA.push( "\\[" + whitespace + "*name" + whitespace + "*=" + + whitespace + "*(?:''|\"\")" ); + } + + // Webkit/Opera - :checked should return selected option elements + // http://www.w3.org/TR/2011/REC-css3-selectors-20110929/#checked + // IE8 throws error here and will not see later tests + if ( !el.querySelectorAll( ":checked" ).length ) { + rbuggyQSA.push( ":checked" ); + } + + // Support: Safari 8+, iOS 8+ + // https://bugs.webkit.org/show_bug.cgi?id=136851 + // In-page `selector#id sibling-combinator selector` fails + if ( !el.querySelectorAll( "a#" + expando + "+*" ).length ) { + rbuggyQSA.push( ".#.+[+~]" ); + } + + // Support: Firefox <=3.6 - 5 only + // Old Firefox doesn't throw on a badly-escaped identifier. + el.querySelectorAll( "\\\f" ); + rbuggyQSA.push( "[\\r\\n\\f]" ); + } ); + + assert( function( el ) { + el.innerHTML = "" + + ""; + + // Support: Windows 8 Native Apps + // The type and name attributes are restricted during .innerHTML assignment + var input = document.createElement( "input" ); + input.setAttribute( "type", "hidden" ); + el.appendChild( input ).setAttribute( "name", "D" ); + + // Support: IE8 + // Enforce case-sensitivity of name attribute + if ( el.querySelectorAll( "[name=d]" ).length ) { + rbuggyQSA.push( "name" + whitespace + "*[*^$|!~]?=" ); + } + + // FF 3.5 - :enabled/:disabled and hidden elements (hidden elements are still enabled) + // IE8 throws error here and will not see later tests + if ( el.querySelectorAll( ":enabled" ).length !== 2 ) { + rbuggyQSA.push( ":enabled", ":disabled" ); + } + + // Support: IE9-11+ + // IE's :disabled selector does not pick up the children of disabled fieldsets + docElem.appendChild( el ).disabled = true; + if ( el.querySelectorAll( ":disabled" ).length !== 2 ) { + rbuggyQSA.push( ":enabled", ":disabled" ); + } + + // Support: Opera 10 - 11 only + // Opera 10-11 does not throw on post-comma invalid pseudos + el.querySelectorAll( "*,:x" ); + rbuggyQSA.push( ",.*:" ); + } ); + } + + if ( ( support.matchesSelector = rnative.test( ( matches = docElem.matches || + docElem.webkitMatchesSelector || + docElem.mozMatchesSelector || + docElem.oMatchesSelector || + docElem.msMatchesSelector ) ) ) ) { + + assert( function( el ) { + + // Check to see if it's possible to do matchesSelector + // on a disconnected node (IE 9) + support.disconnectedMatch = matches.call( el, "*" ); + + // This should fail with an exception + // Gecko does not error, returns false instead + matches.call( el, "[s!='']:x" ); + rbuggyMatches.push( "!=", pseudos ); + } ); + } + + rbuggyQSA = rbuggyQSA.length && new RegExp( rbuggyQSA.join( "|" ) ); + rbuggyMatches = rbuggyMatches.length && new RegExp( rbuggyMatches.join( "|" ) ); + + /* Contains + ---------------------------------------------------------------------- */ + hasCompare = rnative.test( docElem.compareDocumentPosition ); + + // Element contains another + // Purposefully self-exclusive + // As in, an element does not contain itself + contains = hasCompare || rnative.test( docElem.contains ) ? + function( a, b ) { + var adown = a.nodeType === 9 ? a.documentElement : a, + bup = b && b.parentNode; + return a === bup || !!( bup && bup.nodeType === 1 && ( + adown.contains ? 
+ adown.contains( bup ) : + a.compareDocumentPosition && a.compareDocumentPosition( bup ) & 16 + ) ); + } : + function( a, b ) { + if ( b ) { + while ( ( b = b.parentNode ) ) { + if ( b === a ) { + return true; + } + } + } + return false; + }; + + /* Sorting + ---------------------------------------------------------------------- */ + + // Document order sorting + sortOrder = hasCompare ? + function( a, b ) { + + // Flag for duplicate removal + if ( a === b ) { + hasDuplicate = true; + return 0; + } + + // Sort on method existence if only one input has compareDocumentPosition + var compare = !a.compareDocumentPosition - !b.compareDocumentPosition; + if ( compare ) { + return compare; + } + + // Calculate position if both inputs belong to the same document + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. + // eslint-disable-next-line eqeqeq + compare = ( a.ownerDocument || a ) == ( b.ownerDocument || b ) ? + a.compareDocumentPosition( b ) : + + // Otherwise we know they are disconnected + 1; + + // Disconnected nodes + if ( compare & 1 || + ( !support.sortDetached && b.compareDocumentPosition( a ) === compare ) ) { + + // Choose the first element that is related to our preferred document + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. + // eslint-disable-next-line eqeqeq + if ( a == document || a.ownerDocument == preferredDoc && + contains( preferredDoc, a ) ) { + return -1; + } + + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. + // eslint-disable-next-line eqeqeq + if ( b == document || b.ownerDocument == preferredDoc && + contains( preferredDoc, b ) ) { + return 1; + } + + // Maintain original order + return sortInput ? + ( indexOf( sortInput, a ) - indexOf( sortInput, b ) ) : + 0; + } + + return compare & 4 ? -1 : 1; + } : + function( a, b ) { + + // Exit early if the nodes are identical + if ( a === b ) { + hasDuplicate = true; + return 0; + } + + var cur, + i = 0, + aup = a.parentNode, + bup = b.parentNode, + ap = [ a ], + bp = [ b ]; + + // Parentless nodes are either documents or disconnected + if ( !aup || !bup ) { + + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. + /* eslint-disable eqeqeq */ + return a == document ? -1 : + b == document ? 1 : + /* eslint-enable eqeqeq */ + aup ? -1 : + bup ? 1 : + sortInput ? + ( indexOf( sortInput, a ) - indexOf( sortInput, b ) ) : + 0; + + // If the nodes are siblings, we can do a quick check + } else if ( aup === bup ) { + return siblingCheck( a, b ); + } + + // Otherwise we need full lists of their ancestors for comparison + cur = a; + while ( ( cur = cur.parentNode ) ) { + ap.unshift( cur ); + } + cur = b; + while ( ( cur = cur.parentNode ) ) { + bp.unshift( cur ); + } + + // Walk down the tree looking for a discrepancy + while ( ap[ i ] === bp[ i ] ) { + i++; + } + + return i ? + + // Do a sibling check if the nodes have a common ancestor + siblingCheck( ap[ i ], bp[ i ] ) : + + // Otherwise nodes in our document sort first + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. 
+ /* eslint-disable eqeqeq */ + ap[ i ] == preferredDoc ? -1 : + bp[ i ] == preferredDoc ? 1 : + /* eslint-enable eqeqeq */ + 0; + }; + + return document; +}; + +Sizzle.matches = function( expr, elements ) { + return Sizzle( expr, null, null, elements ); +}; + +Sizzle.matchesSelector = function( elem, expr ) { + setDocument( elem ); + + if ( support.matchesSelector && documentIsHTML && + !nonnativeSelectorCache[ expr + " " ] && + ( !rbuggyMatches || !rbuggyMatches.test( expr ) ) && + ( !rbuggyQSA || !rbuggyQSA.test( expr ) ) ) { + + try { + var ret = matches.call( elem, expr ); + + // IE 9's matchesSelector returns false on disconnected nodes + if ( ret || support.disconnectedMatch || + + // As well, disconnected nodes are said to be in a document + // fragment in IE 9 + elem.document && elem.document.nodeType !== 11 ) { + return ret; + } + } catch ( e ) { + nonnativeSelectorCache( expr, true ); + } + } + + return Sizzle( expr, document, null, [ elem ] ).length > 0; +}; + +Sizzle.contains = function( context, elem ) { + + // Set document vars if needed + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. + // eslint-disable-next-line eqeqeq + if ( ( context.ownerDocument || context ) != document ) { + setDocument( context ); + } + return contains( context, elem ); +}; + +Sizzle.attr = function( elem, name ) { + + // Set document vars if needed + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. + // eslint-disable-next-line eqeqeq + if ( ( elem.ownerDocument || elem ) != document ) { + setDocument( elem ); + } + + var fn = Expr.attrHandle[ name.toLowerCase() ], + + // Don't get fooled by Object.prototype properties (jQuery #13807) + val = fn && hasOwn.call( Expr.attrHandle, name.toLowerCase() ) ? + fn( elem, name, !documentIsHTML ) : + undefined; + + return val !== undefined ? + val : + support.attributes || !documentIsHTML ? + elem.getAttribute( name ) : + ( val = elem.getAttributeNode( name ) ) && val.specified ? 
+ val.value : + null; +}; + +Sizzle.escape = function( sel ) { + return ( sel + "" ).replace( rcssescape, fcssescape ); +}; + +Sizzle.error = function( msg ) { + throw new Error( "Syntax error, unrecognized expression: " + msg ); +}; + +/** + * Document sorting and removing duplicates + * @param {ArrayLike} results + */ +Sizzle.uniqueSort = function( results ) { + var elem, + duplicates = [], + j = 0, + i = 0; + + // Unless we *know* we can detect duplicates, assume their presence + hasDuplicate = !support.detectDuplicates; + sortInput = !support.sortStable && results.slice( 0 ); + results.sort( sortOrder ); + + if ( hasDuplicate ) { + while ( ( elem = results[ i++ ] ) ) { + if ( elem === results[ i ] ) { + j = duplicates.push( i ); + } + } + while ( j-- ) { + results.splice( duplicates[ j ], 1 ); + } + } + + // Clear input after sorting to release objects + // See https://github.com/jquery/sizzle/pull/225 + sortInput = null; + + return results; +}; + +/** + * Utility function for retrieving the text value of an array of DOM nodes + * @param {Array|Element} elem + */ +getText = Sizzle.getText = function( elem ) { + var node, + ret = "", + i = 0, + nodeType = elem.nodeType; + + if ( !nodeType ) { + + // If no nodeType, this is expected to be an array + while ( ( node = elem[ i++ ] ) ) { + + // Do not traverse comment nodes + ret += getText( node ); + } + } else if ( nodeType === 1 || nodeType === 9 || nodeType === 11 ) { + + // Use textContent for elements + // innerText usage removed for consistency of new lines (jQuery #11153) + if ( typeof elem.textContent === "string" ) { + return elem.textContent; + } else { + + // Traverse its children + for ( elem = elem.firstChild; elem; elem = elem.nextSibling ) { + ret += getText( elem ); + } + } + } else if ( nodeType === 3 || nodeType === 4 ) { + return elem.nodeValue; + } + + // Do not include comment or processing instruction nodes + + return ret; +}; + +Expr = Sizzle.selectors = { + + // Can be adjusted by the user + cacheLength: 50, + + createPseudo: markFunction, + + match: matchExpr, + + attrHandle: {}, + + find: {}, + + relative: { + ">": { dir: "parentNode", first: true }, + " ": { dir: "parentNode" }, + "+": { dir: "previousSibling", first: true }, + "~": { dir: "previousSibling" } + }, + + preFilter: { + "ATTR": function( match ) { + match[ 1 ] = match[ 1 ].replace( runescape, funescape ); + + // Move the given value to match[3] whether quoted or unquoted + match[ 3 ] = ( match[ 3 ] || match[ 4 ] || + match[ 5 ] || "" ).replace( runescape, funescape ); + + if ( match[ 2 ] === "~=" ) { + match[ 3 ] = " " + match[ 3 ] + " "; + } + + return match.slice( 0, 4 ); + }, + + "CHILD": function( match ) { + + /* matches from matchExpr["CHILD"] + 1 type (only|nth|...) + 2 what (child|of-type) + 3 argument (even|odd|\d*|\d*n([+-]\d+)?|...) + 4 xn-component of xn+y argument ([+-]?\d*n|) + 5 sign of xn-component + 6 x of xn-component + 7 sign of y-component + 8 y of y-component + */ + match[ 1 ] = match[ 1 ].toLowerCase(); + + if ( match[ 1 ].slice( 0, 3 ) === "nth" ) { + + // nth-* requires argument + if ( !match[ 3 ] ) { + Sizzle.error( match[ 0 ] ); + } + + // numeric x and y parameters for Expr.filter.CHILD + // remember that false/true cast respectively to 0/1 + match[ 4 ] = +( match[ 4 ] ? 
+ match[ 5 ] + ( match[ 6 ] || 1 ) : + 2 * ( match[ 3 ] === "even" || match[ 3 ] === "odd" ) ); + match[ 5 ] = +( ( match[ 7 ] + match[ 8 ] ) || match[ 3 ] === "odd" ); + + // other types prohibit arguments + } else if ( match[ 3 ] ) { + Sizzle.error( match[ 0 ] ); + } + + return match; + }, + + "PSEUDO": function( match ) { + var excess, + unquoted = !match[ 6 ] && match[ 2 ]; + + if ( matchExpr[ "CHILD" ].test( match[ 0 ] ) ) { + return null; + } + + // Accept quoted arguments as-is + if ( match[ 3 ] ) { + match[ 2 ] = match[ 4 ] || match[ 5 ] || ""; + + // Strip excess characters from unquoted arguments + } else if ( unquoted && rpseudo.test( unquoted ) && + + // Get excess from tokenize (recursively) + ( excess = tokenize( unquoted, true ) ) && + + // advance to the next closing parenthesis + ( excess = unquoted.indexOf( ")", unquoted.length - excess ) - unquoted.length ) ) { + + // excess is a negative index + match[ 0 ] = match[ 0 ].slice( 0, excess ); + match[ 2 ] = unquoted.slice( 0, excess ); + } + + // Return only captures needed by the pseudo filter method (type and argument) + return match.slice( 0, 3 ); + } + }, + + filter: { + + "TAG": function( nodeNameSelector ) { + var nodeName = nodeNameSelector.replace( runescape, funescape ).toLowerCase(); + return nodeNameSelector === "*" ? + function() { + return true; + } : + function( elem ) { + return elem.nodeName && elem.nodeName.toLowerCase() === nodeName; + }; + }, + + "CLASS": function( className ) { + var pattern = classCache[ className + " " ]; + + return pattern || + ( pattern = new RegExp( "(^|" + whitespace + + ")" + className + "(" + whitespace + "|$)" ) ) && classCache( + className, function( elem ) { + return pattern.test( + typeof elem.className === "string" && elem.className || + typeof elem.getAttribute !== "undefined" && + elem.getAttribute( "class" ) || + "" + ); + } ); + }, + + "ATTR": function( name, operator, check ) { + return function( elem ) { + var result = Sizzle.attr( elem, name ); + + if ( result == null ) { + return operator === "!="; + } + if ( !operator ) { + return true; + } + + result += ""; + + /* eslint-disable max-len */ + + return operator === "=" ? result === check : + operator === "!=" ? result !== check : + operator === "^=" ? check && result.indexOf( check ) === 0 : + operator === "*=" ? check && result.indexOf( check ) > -1 : + operator === "$=" ? check && result.slice( -check.length ) === check : + operator === "~=" ? ( " " + result.replace( rwhitespace, " " ) + " " ).indexOf( check ) > -1 : + operator === "|=" ? result === check || result.slice( 0, check.length + 1 ) === check + "-" : + false; + /* eslint-enable max-len */ + + }; + }, + + "CHILD": function( type, what, _argument, first, last ) { + var simple = type.slice( 0, 3 ) !== "nth", + forward = type.slice( -4 ) !== "last", + ofType = what === "of-type"; + + return first === 1 && last === 0 ? + + // Shortcut for :nth-*(n) + function( elem ) { + return !!elem.parentNode; + } : + + function( elem, _context, xml ) { + var cache, uniqueCache, outerCache, node, nodeIndex, start, + dir = simple !== forward ? "nextSibling" : "previousSibling", + parent = elem.parentNode, + name = ofType && elem.nodeName.toLowerCase(), + useCache = !xml && !ofType, + diff = false; + + if ( parent ) { + + // :(first|last|only)-(child|of-type) + if ( simple ) { + while ( dir ) { + node = elem; + while ( ( node = node[ dir ] ) ) { + if ( ofType ? 
+ node.nodeName.toLowerCase() === name : + node.nodeType === 1 ) { + + return false; + } + } + + // Reverse direction for :only-* (if we haven't yet done so) + start = dir = type === "only" && !start && "nextSibling"; + } + return true; + } + + start = [ forward ? parent.firstChild : parent.lastChild ]; + + // non-xml :nth-child(...) stores cache data on `parent` + if ( forward && useCache ) { + + // Seek `elem` from a previously-cached index + + // ...in a gzip-friendly way + node = parent; + outerCache = node[ expando ] || ( node[ expando ] = {} ); + + // Support: IE <9 only + // Defend against cloned attroperties (jQuery gh-1709) + uniqueCache = outerCache[ node.uniqueID ] || + ( outerCache[ node.uniqueID ] = {} ); + + cache = uniqueCache[ type ] || []; + nodeIndex = cache[ 0 ] === dirruns && cache[ 1 ]; + diff = nodeIndex && cache[ 2 ]; + node = nodeIndex && parent.childNodes[ nodeIndex ]; + + while ( ( node = ++nodeIndex && node && node[ dir ] || + + // Fallback to seeking `elem` from the start + ( diff = nodeIndex = 0 ) || start.pop() ) ) { + + // When found, cache indexes on `parent` and break + if ( node.nodeType === 1 && ++diff && node === elem ) { + uniqueCache[ type ] = [ dirruns, nodeIndex, diff ]; + break; + } + } + + } else { + + // Use previously-cached element index if available + if ( useCache ) { + + // ...in a gzip-friendly way + node = elem; + outerCache = node[ expando ] || ( node[ expando ] = {} ); + + // Support: IE <9 only + // Defend against cloned attroperties (jQuery gh-1709) + uniqueCache = outerCache[ node.uniqueID ] || + ( outerCache[ node.uniqueID ] = {} ); + + cache = uniqueCache[ type ] || []; + nodeIndex = cache[ 0 ] === dirruns && cache[ 1 ]; + diff = nodeIndex; + } + + // xml :nth-child(...) + // or :nth-last-child(...) or :nth(-last)?-of-type(...) + if ( diff === false ) { + + // Use the same loop as above to seek `elem` from the start + while ( ( node = ++nodeIndex && node && node[ dir ] || + ( diff = nodeIndex = 0 ) || start.pop() ) ) { + + if ( ( ofType ? + node.nodeName.toLowerCase() === name : + node.nodeType === 1 ) && + ++diff ) { + + // Cache the index of each encountered element + if ( useCache ) { + outerCache = node[ expando ] || + ( node[ expando ] = {} ); + + // Support: IE <9 only + // Defend against cloned attroperties (jQuery gh-1709) + uniqueCache = outerCache[ node.uniqueID ] || + ( outerCache[ node.uniqueID ] = {} ); + + uniqueCache[ type ] = [ dirruns, diff ]; + } + + if ( node === elem ) { + break; + } + } + } + } + } + + // Incorporate the offset, then check against cycle size + diff -= last; + return diff === first || ( diff % first === 0 && diff / first >= 0 ); + } + }; + }, + + "PSEUDO": function( pseudo, argument ) { + + // pseudo-class names are case-insensitive + // http://www.w3.org/TR/selectors/#pseudo-classes + // Prioritize by case sensitivity in case custom pseudos are added with uppercase letters + // Remember that setFilters inherits from pseudos + var args, + fn = Expr.pseudos[ pseudo ] || Expr.setFilters[ pseudo.toLowerCase() ] || + Sizzle.error( "unsupported pseudo: " + pseudo ); + + // The user may use createPseudo to indicate that + // arguments are needed to create the filter function + // just as Sizzle does + if ( fn[ expando ] ) { + return fn( argument ); + } + + // But maintain support for old signatures + if ( fn.length > 1 ) { + args = [ pseudo, pseudo, "", argument ]; + return Expr.setFilters.hasOwnProperty( pseudo.toLowerCase() ) ? 
+ markFunction( function( seed, matches ) { + var idx, + matched = fn( seed, argument ), + i = matched.length; + while ( i-- ) { + idx = indexOf( seed, matched[ i ] ); + seed[ idx ] = !( matches[ idx ] = matched[ i ] ); + } + } ) : + function( elem ) { + return fn( elem, 0, args ); + }; + } + + return fn; + } + }, + + pseudos: { + + // Potentially complex pseudos + "not": markFunction( function( selector ) { + + // Trim the selector passed to compile + // to avoid treating leading and trailing + // spaces as combinators + var input = [], + results = [], + matcher = compile( selector.replace( rtrim, "$1" ) ); + + return matcher[ expando ] ? + markFunction( function( seed, matches, _context, xml ) { + var elem, + unmatched = matcher( seed, null, xml, [] ), + i = seed.length; + + // Match elements unmatched by `matcher` + while ( i-- ) { + if ( ( elem = unmatched[ i ] ) ) { + seed[ i ] = !( matches[ i ] = elem ); + } + } + } ) : + function( elem, _context, xml ) { + input[ 0 ] = elem; + matcher( input, null, xml, results ); + + // Don't keep the element (issue #299) + input[ 0 ] = null; + return !results.pop(); + }; + } ), + + "has": markFunction( function( selector ) { + return function( elem ) { + return Sizzle( selector, elem ).length > 0; + }; + } ), + + "contains": markFunction( function( text ) { + text = text.replace( runescape, funescape ); + return function( elem ) { + return ( elem.textContent || getText( elem ) ).indexOf( text ) > -1; + }; + } ), + + // "Whether an element is represented by a :lang() selector + // is based solely on the element's language value + // being equal to the identifier C, + // or beginning with the identifier C immediately followed by "-". + // The matching of C against the element's language value is performed case-insensitively. + // The identifier C does not have to be a valid language name." + // http://www.w3.org/TR/selectors/#lang-pseudo + "lang": markFunction( function( lang ) { + + // lang value must be a valid identifier + if ( !ridentifier.test( lang || "" ) ) { + Sizzle.error( "unsupported lang: " + lang ); + } + lang = lang.replace( runescape, funescape ).toLowerCase(); + return function( elem ) { + var elemLang; + do { + if ( ( elemLang = documentIsHTML ? 
+ elem.lang : + elem.getAttribute( "xml:lang" ) || elem.getAttribute( "lang" ) ) ) { + + elemLang = elemLang.toLowerCase(); + return elemLang === lang || elemLang.indexOf( lang + "-" ) === 0; + } + } while ( ( elem = elem.parentNode ) && elem.nodeType === 1 ); + return false; + }; + } ), + + // Miscellaneous + "target": function( elem ) { + var hash = window.location && window.location.hash; + return hash && hash.slice( 1 ) === elem.id; + }, + + "root": function( elem ) { + return elem === docElem; + }, + + "focus": function( elem ) { + return elem === document.activeElement && + ( !document.hasFocus || document.hasFocus() ) && + !!( elem.type || elem.href || ~elem.tabIndex ); + }, + + // Boolean properties + "enabled": createDisabledPseudo( false ), + "disabled": createDisabledPseudo( true ), + + "checked": function( elem ) { + + // In CSS3, :checked should return both checked and selected elements + // http://www.w3.org/TR/2011/REC-css3-selectors-20110929/#checked + var nodeName = elem.nodeName.toLowerCase(); + return ( nodeName === "input" && !!elem.checked ) || + ( nodeName === "option" && !!elem.selected ); + }, + + "selected": function( elem ) { + + // Accessing this property makes selected-by-default + // options in Safari work properly + if ( elem.parentNode ) { + // eslint-disable-next-line no-unused-expressions + elem.parentNode.selectedIndex; + } + + return elem.selected === true; + }, + + // Contents + "empty": function( elem ) { + + // http://www.w3.org/TR/selectors/#empty-pseudo + // :empty is negated by element (1) or content nodes (text: 3; cdata: 4; entity ref: 5), + // but not by others (comment: 8; processing instruction: 7; etc.) + // nodeType < 6 works because attributes (2) do not appear as children + for ( elem = elem.firstChild; elem; elem = elem.nextSibling ) { + if ( elem.nodeType < 6 ) { + return false; + } + } + return true; + }, + + "parent": function( elem ) { + return !Expr.pseudos[ "empty" ]( elem ); + }, + + // Element/input types + "header": function( elem ) { + return rheader.test( elem.nodeName ); + }, + + "input": function( elem ) { + return rinputs.test( elem.nodeName ); + }, + + "button": function( elem ) { + var name = elem.nodeName.toLowerCase(); + return name === "input" && elem.type === "button" || name === "button"; + }, + + "text": function( elem ) { + var attr; + return elem.nodeName.toLowerCase() === "input" && + elem.type === "text" && + + // Support: IE<8 + // New HTML5 attribute values (e.g., "search") appear with elem.type === "text" + ( ( attr = elem.getAttribute( "type" ) ) == null || + attr.toLowerCase() === "text" ); + }, + + // Position-in-collection + "first": createPositionalPseudo( function() { + return [ 0 ]; + } ), + + "last": createPositionalPseudo( function( _matchIndexes, length ) { + return [ length - 1 ]; + } ), + + "eq": createPositionalPseudo( function( _matchIndexes, length, argument ) { + return [ argument < 0 ? argument + length : argument ]; + } ), + + "even": createPositionalPseudo( function( matchIndexes, length ) { + var i = 0; + for ( ; i < length; i += 2 ) { + matchIndexes.push( i ); + } + return matchIndexes; + } ), + + "odd": createPositionalPseudo( function( matchIndexes, length ) { + var i = 1; + for ( ; i < length; i += 2 ) { + matchIndexes.push( i ); + } + return matchIndexes; + } ), + + "lt": createPositionalPseudo( function( matchIndexes, length, argument ) { + var i = argument < 0 ? + argument + length : + argument > length ? 
+ length : + argument; + for ( ; --i >= 0; ) { + matchIndexes.push( i ); + } + return matchIndexes; + } ), + + "gt": createPositionalPseudo( function( matchIndexes, length, argument ) { + var i = argument < 0 ? argument + length : argument; + for ( ; ++i < length; ) { + matchIndexes.push( i ); + } + return matchIndexes; + } ) + } +}; + +Expr.pseudos[ "nth" ] = Expr.pseudos[ "eq" ]; + +// Add button/input type pseudos +for ( i in { radio: true, checkbox: true, file: true, password: true, image: true } ) { + Expr.pseudos[ i ] = createInputPseudo( i ); +} +for ( i in { submit: true, reset: true } ) { + Expr.pseudos[ i ] = createButtonPseudo( i ); +} + +// Easy API for creating new setFilters +function setFilters() {} +setFilters.prototype = Expr.filters = Expr.pseudos; +Expr.setFilters = new setFilters(); + +tokenize = Sizzle.tokenize = function( selector, parseOnly ) { + var matched, match, tokens, type, + soFar, groups, preFilters, + cached = tokenCache[ selector + " " ]; + + if ( cached ) { + return parseOnly ? 0 : cached.slice( 0 ); + } + + soFar = selector; + groups = []; + preFilters = Expr.preFilter; + + while ( soFar ) { + + // Comma and first run + if ( !matched || ( match = rcomma.exec( soFar ) ) ) { + if ( match ) { + + // Don't consume trailing commas as valid + soFar = soFar.slice( match[ 0 ].length ) || soFar; + } + groups.push( ( tokens = [] ) ); + } + + matched = false; + + // Combinators + if ( ( match = rcombinators.exec( soFar ) ) ) { + matched = match.shift(); + tokens.push( { + value: matched, + + // Cast descendant combinators to space + type: match[ 0 ].replace( rtrim, " " ) + } ); + soFar = soFar.slice( matched.length ); + } + + // Filters + for ( type in Expr.filter ) { + if ( ( match = matchExpr[ type ].exec( soFar ) ) && ( !preFilters[ type ] || + ( match = preFilters[ type ]( match ) ) ) ) { + matched = match.shift(); + tokens.push( { + value: matched, + type: type, + matches: match + } ); + soFar = soFar.slice( matched.length ); + } + } + + if ( !matched ) { + break; + } + } + + // Return the length of the invalid excess + // if we're just parsing + // Otherwise, throw an error or return tokens + return parseOnly ? + soFar.length : + soFar ? + Sizzle.error( selector ) : + + // Cache the tokens + tokenCache( selector, groups ).slice( 0 ); +}; + +function toSelector( tokens ) { + var i = 0, + len = tokens.length, + selector = ""; + for ( ; i < len; i++ ) { + selector += tokens[ i ].value; + } + return selector; +} + +function addCombinator( matcher, combinator, base ) { + var dir = combinator.dir, + skip = combinator.next, + key = skip || dir, + checkNonElements = base && key === "parentNode", + doneName = done++; + + return combinator.first ? 
+ + // Check against closest ancestor/preceding element + function( elem, context, xml ) { + while ( ( elem = elem[ dir ] ) ) { + if ( elem.nodeType === 1 || checkNonElements ) { + return matcher( elem, context, xml ); + } + } + return false; + } : + + // Check against all ancestor/preceding elements + function( elem, context, xml ) { + var oldCache, uniqueCache, outerCache, + newCache = [ dirruns, doneName ]; + + // We can't set arbitrary data on XML nodes, so they don't benefit from combinator caching + if ( xml ) { + while ( ( elem = elem[ dir ] ) ) { + if ( elem.nodeType === 1 || checkNonElements ) { + if ( matcher( elem, context, xml ) ) { + return true; + } + } + } + } else { + while ( ( elem = elem[ dir ] ) ) { + if ( elem.nodeType === 1 || checkNonElements ) { + outerCache = elem[ expando ] || ( elem[ expando ] = {} ); + + // Support: IE <9 only + // Defend against cloned attroperties (jQuery gh-1709) + uniqueCache = outerCache[ elem.uniqueID ] || + ( outerCache[ elem.uniqueID ] = {} ); + + if ( skip && skip === elem.nodeName.toLowerCase() ) { + elem = elem[ dir ] || elem; + } else if ( ( oldCache = uniqueCache[ key ] ) && + oldCache[ 0 ] === dirruns && oldCache[ 1 ] === doneName ) { + + // Assign to newCache so results back-propagate to previous elements + return ( newCache[ 2 ] = oldCache[ 2 ] ); + } else { + + // Reuse newcache so results back-propagate to previous elements + uniqueCache[ key ] = newCache; + + // A match means we're done; a fail means we have to keep checking + if ( ( newCache[ 2 ] = matcher( elem, context, xml ) ) ) { + return true; + } + } + } + } + } + return false; + }; +} + +function elementMatcher( matchers ) { + return matchers.length > 1 ? + function( elem, context, xml ) { + var i = matchers.length; + while ( i-- ) { + if ( !matchers[ i ]( elem, context, xml ) ) { + return false; + } + } + return true; + } : + matchers[ 0 ]; +} + +function multipleContexts( selector, contexts, results ) { + var i = 0, + len = contexts.length; + for ( ; i < len; i++ ) { + Sizzle( selector, contexts[ i ], results ); + } + return results; +} + +function condense( unmatched, map, filter, context, xml ) { + var elem, + newUnmatched = [], + i = 0, + len = unmatched.length, + mapped = map != null; + + for ( ; i < len; i++ ) { + if ( ( elem = unmatched[ i ] ) ) { + if ( !filter || filter( elem, context, xml ) ) { + newUnmatched.push( elem ); + if ( mapped ) { + map.push( i ); + } + } + } + } + + return newUnmatched; +} + +function setMatcher( preFilter, selector, matcher, postFilter, postFinder, postSelector ) { + if ( postFilter && !postFilter[ expando ] ) { + postFilter = setMatcher( postFilter ); + } + if ( postFinder && !postFinder[ expando ] ) { + postFinder = setMatcher( postFinder, postSelector ); + } + return markFunction( function( seed, results, context, xml ) { + var temp, i, elem, + preMap = [], + postMap = [], + preexisting = results.length, + + // Get initial elements from seed or context + elems = seed || multipleContexts( + selector || "*", + context.nodeType ? [ context ] : context, + [] + ), + + // Prefilter to get matcher input, preserving a map for seed-results synchronization + matcherIn = preFilter && ( seed || !selector ) ? + condense( elems, preMap, preFilter, context, xml ) : + elems, + + matcherOut = matcher ? + + // If we have a postFinder, or filtered seed, or non-seed postFilter or preexisting results, + postFinder || ( seed ? preFilter : preexisting || postFilter ) ? 
+ + // ...intermediate processing is necessary + [] : + + // ...otherwise use results directly + results : + matcherIn; + + // Find primary matches + if ( matcher ) { + matcher( matcherIn, matcherOut, context, xml ); + } + + // Apply postFilter + if ( postFilter ) { + temp = condense( matcherOut, postMap ); + postFilter( temp, [], context, xml ); + + // Un-match failing elements by moving them back to matcherIn + i = temp.length; + while ( i-- ) { + if ( ( elem = temp[ i ] ) ) { + matcherOut[ postMap[ i ] ] = !( matcherIn[ postMap[ i ] ] = elem ); + } + } + } + + if ( seed ) { + if ( postFinder || preFilter ) { + if ( postFinder ) { + + // Get the final matcherOut by condensing this intermediate into postFinder contexts + temp = []; + i = matcherOut.length; + while ( i-- ) { + if ( ( elem = matcherOut[ i ] ) ) { + + // Restore matcherIn since elem is not yet a final match + temp.push( ( matcherIn[ i ] = elem ) ); + } + } + postFinder( null, ( matcherOut = [] ), temp, xml ); + } + + // Move matched elements from seed to results to keep them synchronized + i = matcherOut.length; + while ( i-- ) { + if ( ( elem = matcherOut[ i ] ) && + ( temp = postFinder ? indexOf( seed, elem ) : preMap[ i ] ) > -1 ) { + + seed[ temp ] = !( results[ temp ] = elem ); + } + } + } + + // Add elements to results, through postFinder if defined + } else { + matcherOut = condense( + matcherOut === results ? + matcherOut.splice( preexisting, matcherOut.length ) : + matcherOut + ); + if ( postFinder ) { + postFinder( null, results, matcherOut, xml ); + } else { + push.apply( results, matcherOut ); + } + } + } ); +} + +function matcherFromTokens( tokens ) { + var checkContext, matcher, j, + len = tokens.length, + leadingRelative = Expr.relative[ tokens[ 0 ].type ], + implicitRelative = leadingRelative || Expr.relative[ " " ], + i = leadingRelative ? 1 : 0, + + // The foundational matcher ensures that elements are reachable from top-level context(s) + matchContext = addCombinator( function( elem ) { + return elem === checkContext; + }, implicitRelative, true ), + matchAnyContext = addCombinator( function( elem ) { + return indexOf( checkContext, elem ) > -1; + }, implicitRelative, true ), + matchers = [ function( elem, context, xml ) { + var ret = ( !leadingRelative && ( xml || context !== outermostContext ) ) || ( + ( checkContext = context ).nodeType ? + matchContext( elem, context, xml ) : + matchAnyContext( elem, context, xml ) ); + + // Avoid hanging onto element (issue #299) + checkContext = null; + return ret; + } ]; + + for ( ; i < len; i++ ) { + if ( ( matcher = Expr.relative[ tokens[ i ].type ] ) ) { + matchers = [ addCombinator( elementMatcher( matchers ), matcher ) ]; + } else { + matcher = Expr.filter[ tokens[ i ].type ].apply( null, tokens[ i ].matches ); + + // Return special upon seeing a positional matcher + if ( matcher[ expando ] ) { + + // Find the next relative operator (if any) for proper handling + j = ++i; + for ( ; j < len; j++ ) { + if ( Expr.relative[ tokens[ j ].type ] ) { + break; + } + } + return setMatcher( + i > 1 && elementMatcher( matchers ), + i > 1 && toSelector( + + // If the preceding token was a descendant combinator, insert an implicit any-element `*` + tokens + .slice( 0, i - 1 ) + .concat( { value: tokens[ i - 2 ].type === " " ? 
"*" : "" } ) + ).replace( rtrim, "$1" ), + matcher, + i < j && matcherFromTokens( tokens.slice( i, j ) ), + j < len && matcherFromTokens( ( tokens = tokens.slice( j ) ) ), + j < len && toSelector( tokens ) + ); + } + matchers.push( matcher ); + } + } + + return elementMatcher( matchers ); +} + +function matcherFromGroupMatchers( elementMatchers, setMatchers ) { + var bySet = setMatchers.length > 0, + byElement = elementMatchers.length > 0, + superMatcher = function( seed, context, xml, results, outermost ) { + var elem, j, matcher, + matchedCount = 0, + i = "0", + unmatched = seed && [], + setMatched = [], + contextBackup = outermostContext, + + // We must always have either seed elements or outermost context + elems = seed || byElement && Expr.find[ "TAG" ]( "*", outermost ), + + // Use integer dirruns iff this is the outermost matcher + dirrunsUnique = ( dirruns += contextBackup == null ? 1 : Math.random() || 0.1 ), + len = elems.length; + + if ( outermost ) { + + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. + // eslint-disable-next-line eqeqeq + outermostContext = context == document || context || outermost; + } + + // Add elements passing elementMatchers directly to results + // Support: IE<9, Safari + // Tolerate NodeList properties (IE: "length"; Safari: ) matching elements by id + for ( ; i !== len && ( elem = elems[ i ] ) != null; i++ ) { + if ( byElement && elem ) { + j = 0; + + // Support: IE 11+, Edge 17 - 18+ + // IE/Edge sometimes throw a "Permission denied" error when strict-comparing + // two documents; shallow comparisons work. + // eslint-disable-next-line eqeqeq + if ( !context && elem.ownerDocument != document ) { + setDocument( elem ); + xml = !documentIsHTML; + } + while ( ( matcher = elementMatchers[ j++ ] ) ) { + if ( matcher( elem, context || document, xml ) ) { + results.push( elem ); + break; + } + } + if ( outermost ) { + dirruns = dirrunsUnique; + } + } + + // Track unmatched elements for set filters + if ( bySet ) { + + // They will have gone through all possible matchers + if ( ( elem = !matcher && elem ) ) { + matchedCount--; + } + + // Lengthen the array for every element, matched or not + if ( seed ) { + unmatched.push( elem ); + } + } + } + + // `i` is now the count of elements visited above, and adding it to `matchedCount` + // makes the latter nonnegative. + matchedCount += i; + + // Apply set filters to unmatched elements + // NOTE: This can be skipped if there are no unmatched elements (i.e., `matchedCount` + // equals `i`), unless we didn't visit _any_ elements in the above loop because we have + // no element matchers and no seed. + // Incrementing an initially-string "0" `i` allows `i` to remain a string only in that + // case, which will result in a "00" `matchedCount` that differs from `i` but is also + // numerically zero. 
+ if ( bySet && i !== matchedCount ) { + j = 0; + while ( ( matcher = setMatchers[ j++ ] ) ) { + matcher( unmatched, setMatched, context, xml ); + } + + if ( seed ) { + + // Reintegrate element matches to eliminate the need for sorting + if ( matchedCount > 0 ) { + while ( i-- ) { + if ( !( unmatched[ i ] || setMatched[ i ] ) ) { + setMatched[ i ] = pop.call( results ); + } + } + } + + // Discard index placeholder values to get only actual matches + setMatched = condense( setMatched ); + } + + // Add matches to results + push.apply( results, setMatched ); + + // Seedless set matches succeeding multiple successful matchers stipulate sorting + if ( outermost && !seed && setMatched.length > 0 && + ( matchedCount + setMatchers.length ) > 1 ) { + + Sizzle.uniqueSort( results ); + } + } + + // Override manipulation of globals by nested matchers + if ( outermost ) { + dirruns = dirrunsUnique; + outermostContext = contextBackup; + } + + return unmatched; + }; + + return bySet ? + markFunction( superMatcher ) : + superMatcher; +} + +compile = Sizzle.compile = function( selector, match /* Internal Use Only */ ) { + var i, + setMatchers = [], + elementMatchers = [], + cached = compilerCache[ selector + " " ]; + + if ( !cached ) { + + // Generate a function of recursive functions that can be used to check each element + if ( !match ) { + match = tokenize( selector ); + } + i = match.length; + while ( i-- ) { + cached = matcherFromTokens( match[ i ] ); + if ( cached[ expando ] ) { + setMatchers.push( cached ); + } else { + elementMatchers.push( cached ); + } + } + + // Cache the compiled function + cached = compilerCache( + selector, + matcherFromGroupMatchers( elementMatchers, setMatchers ) + ); + + // Save selector and tokenization + cached.selector = selector; + } + return cached; +}; + +/** + * A low-level selection function that works with Sizzle's compiled + * selector functions + * @param {String|Function} selector A selector or a pre-compiled + * selector function built with Sizzle.compile + * @param {Element} context + * @param {Array} [results] + * @param {Array} [seed] A set of elements to match against + */ +select = Sizzle.select = function( selector, context, results, seed ) { + var i, tokens, token, type, find, + compiled = typeof selector === "function" && selector, + match = !seed && tokenize( ( selector = compiled.selector || selector ) ); + + results = results || []; + + // Try to minimize operations if there is only one selector in the list and no seed + // (the latter of which guarantees us context) + if ( match.length === 1 ) { + + // Reduce context if the leading compound selector is an ID + tokens = match[ 0 ] = match[ 0 ].slice( 0 ); + if ( tokens.length > 2 && ( token = tokens[ 0 ] ).type === "ID" && + context.nodeType === 9 && documentIsHTML && Expr.relative[ tokens[ 1 ].type ] ) { + + context = ( Expr.find[ "ID" ]( token.matches[ 0 ] + .replace( runescape, funescape ), context ) || [] )[ 0 ]; + if ( !context ) { + return results; + + // Precompiled matchers will still verify ancestry, so step up a level + } else if ( compiled ) { + context = context.parentNode; + } + + selector = selector.slice( tokens.shift().value.length ); + } + + // Fetch a seed set for right-to-left matching + i = matchExpr[ "needsContext" ].test( selector ) ? 
0 : tokens.length; + while ( i-- ) { + token = tokens[ i ]; + + // Abort if we hit a combinator + if ( Expr.relative[ ( type = token.type ) ] ) { + break; + } + if ( ( find = Expr.find[ type ] ) ) { + + // Search, expanding context for leading sibling combinators + if ( ( seed = find( + token.matches[ 0 ].replace( runescape, funescape ), + rsibling.test( tokens[ 0 ].type ) && testContext( context.parentNode ) || + context + ) ) ) { + + // If seed is empty or no tokens remain, we can return early + tokens.splice( i, 1 ); + selector = seed.length && toSelector( tokens ); + if ( !selector ) { + push.apply( results, seed ); + return results; + } + + break; + } + } + } + } + + // Compile and execute a filtering function if one is not provided + // Provide `match` to avoid retokenization if we modified the selector above + ( compiled || compile( selector, match ) )( + seed, + context, + !documentIsHTML, + results, + !context || rsibling.test( selector ) && testContext( context.parentNode ) || context + ); + return results; +}; + +// One-time assignments + +// Sort stability +support.sortStable = expando.split( "" ).sort( sortOrder ).join( "" ) === expando; + +// Support: Chrome 14-35+ +// Always assume duplicates if they aren't passed to the comparison function +support.detectDuplicates = !!hasDuplicate; + +// Initialize against the default document +setDocument(); + +// Support: Webkit<537.32 - Safari 6.0.3/Chrome 25 (fixed in Chrome 27) +// Detached nodes confoundingly follow *each other* +support.sortDetached = assert( function( el ) { + + // Should return 1, but returns 4 (following) + return el.compareDocumentPosition( document.createElement( "fieldset" ) ) & 1; +} ); + +// Support: IE<8 +// Prevent attribute/property "interpolation" +// https://msdn.microsoft.com/en-us/library/ms536429%28VS.85%29.aspx +if ( !assert( function( el ) { + el.innerHTML = "<a href='#'></a>"; + return el.firstChild.getAttribute( "href" ) === "#"; +} ) ) { + addHandle( "type|href|height|width", function( elem, name, isXML ) { + if ( !isXML ) { + return elem.getAttribute( name, name.toLowerCase() === "type" ? 1 : 2 ); + } + } ); +} + +// Support: IE<9 +// Use defaultValue in place of getAttribute("value") +if ( !support.attributes || !assert( function( el ) { + el.innerHTML = "<input/>"; + el.firstChild.setAttribute( "value", "" ); + return el.firstChild.getAttribute( "value" ) === ""; +} ) ) { + addHandle( "value", function( elem, _name, isXML ) { + if ( !isXML && elem.nodeName.toLowerCase() === "input" ) { + return elem.defaultValue; + } + } ); +} + +// Support: IE<9 +// Use getAttributeNode to fetch booleans when getAttribute lies +if ( !assert( function( el ) { + return el.getAttribute( "disabled" ) == null; +} ) ) { + addHandle( booleans, function( elem, name, isXML ) { + var val; + if ( !isXML ) { + return elem[ name ] === true ? name.toLowerCase() : + ( val = elem.getAttributeNode( name ) ) && val.specified ?
+ val.value : + null; + } + } ); +} + +return Sizzle; + +} )( window ); + + + +jQuery.find = Sizzle; +jQuery.expr = Sizzle.selectors; + +// Deprecated +jQuery.expr[ ":" ] = jQuery.expr.pseudos; +jQuery.uniqueSort = jQuery.unique = Sizzle.uniqueSort; +jQuery.text = Sizzle.getText; +jQuery.isXMLDoc = Sizzle.isXML; +jQuery.contains = Sizzle.contains; +jQuery.escapeSelector = Sizzle.escape; + + + + +var dir = function( elem, dir, until ) { + var matched = [], + truncate = until !== undefined; + + while ( ( elem = elem[ dir ] ) && elem.nodeType !== 9 ) { + if ( elem.nodeType === 1 ) { + if ( truncate && jQuery( elem ).is( until ) ) { + break; + } + matched.push( elem ); + } + } + return matched; +}; + + +var siblings = function( n, elem ) { + var matched = []; + + for ( ; n; n = n.nextSibling ) { + if ( n.nodeType === 1 && n !== elem ) { + matched.push( n ); + } + } + + return matched; +}; + + +var rneedsContext = jQuery.expr.match.needsContext; + + + +function nodeName( elem, name ) { + + return elem.nodeName && elem.nodeName.toLowerCase() === name.toLowerCase(); + +}; +var rsingleTag = ( /^<([a-z][^\/\0>:\x20\t\r\n\f]*)[\x20\t\r\n\f]*\/?>(?:<\/\1>|)$/i ); + + + +// Implement the identical functionality for filter and not +function winnow( elements, qualifier, not ) { + if ( isFunction( qualifier ) ) { + return jQuery.grep( elements, function( elem, i ) { + return !!qualifier.call( elem, i, elem ) !== not; + } ); + } + + // Single element + if ( qualifier.nodeType ) { + return jQuery.grep( elements, function( elem ) { + return ( elem === qualifier ) !== not; + } ); + } + + // Arraylike of elements (jQuery, arguments, Array) + if ( typeof qualifier !== "string" ) { + return jQuery.grep( elements, function( elem ) { + return ( indexOf.call( qualifier, elem ) > -1 ) !== not; + } ); + } + + // Filtered directly for both simple and complex selectors + return jQuery.filter( qualifier, elements, not ); +} + +jQuery.filter = function( expr, elems, not ) { + var elem = elems[ 0 ]; + + if ( not ) { + expr = ":not(" + expr + ")"; + } + + if ( elems.length === 1 && elem.nodeType === 1 ) { + return jQuery.find.matchesSelector( elem, expr ) ? [ elem ] : []; + } + + return jQuery.find.matches( expr, jQuery.grep( elems, function( elem ) { + return elem.nodeType === 1; + } ) ); +}; + +jQuery.fn.extend( { + find: function( selector ) { + var i, ret, + len = this.length, + self = this; + + if ( typeof selector !== "string" ) { + return this.pushStack( jQuery( selector ).filter( function() { + for ( i = 0; i < len; i++ ) { + if ( jQuery.contains( self[ i ], this ) ) { + return true; + } + } + } ) ); + } + + ret = this.pushStack( [] ); + + for ( i = 0; i < len; i++ ) { + jQuery.find( selector, self[ i ], ret ); + } + + return len > 1 ? jQuery.uniqueSort( ret ) : ret; + }, + filter: function( selector ) { + return this.pushStack( winnow( this, selector || [], false ) ); + }, + not: function( selector ) { + return this.pushStack( winnow( this, selector || [], true ) ); + }, + is: function( selector ) { + return !!winnow( + this, + + // If this is a positional/relative selector, check membership in the returned set + // so $("p:first").is("p:last") won't return true for a doc with two "p". + typeof selector === "string" && rneedsContext.test( selector ) ? 
+ jQuery( selector ) : + selector || [], + false + ).length; + } +} ); + + +// Initialize a jQuery object + + +// A central reference to the root jQuery(document) +var rootjQuery, + + // A simple way to check for HTML strings + // Prioritize #id over to avoid XSS via location.hash (#9521) + // Strict HTML recognition (#11290: must start with <) + // Shortcut simple #id case for speed + rquickExpr = /^(?:\s*(<[\w\W]+>)[^>]*|#([\w-]+))$/, + + init = jQuery.fn.init = function( selector, context, root ) { + var match, elem; + + // HANDLE: $(""), $(null), $(undefined), $(false) + if ( !selector ) { + return this; + } + + // Method init() accepts an alternate rootjQuery + // so migrate can support jQuery.sub (gh-2101) + root = root || rootjQuery; + + // Handle HTML strings + if ( typeof selector === "string" ) { + if ( selector[ 0 ] === "<" && + selector[ selector.length - 1 ] === ">" && + selector.length >= 3 ) { + + // Assume that strings that start and end with <> are HTML and skip the regex check + match = [ null, selector, null ]; + + } else { + match = rquickExpr.exec( selector ); + } + + // Match html or make sure no context is specified for #id + if ( match && ( match[ 1 ] || !context ) ) { + + // HANDLE: $(html) -> $(array) + if ( match[ 1 ] ) { + context = context instanceof jQuery ? context[ 0 ] : context; + + // Option to run scripts is true for back-compat + // Intentionally let the error be thrown if parseHTML is not present + jQuery.merge( this, jQuery.parseHTML( + match[ 1 ], + context && context.nodeType ? context.ownerDocument || context : document, + true + ) ); + + // HANDLE: $(html, props) + if ( rsingleTag.test( match[ 1 ] ) && jQuery.isPlainObject( context ) ) { + for ( match in context ) { + + // Properties of context are called as methods if possible + if ( isFunction( this[ match ] ) ) { + this[ match ]( context[ match ] ); + + // ...and otherwise set as attributes + } else { + this.attr( match, context[ match ] ); + } + } + } + + return this; + + // HANDLE: $(#id) + } else { + elem = document.getElementById( match[ 2 ] ); + + if ( elem ) { + + // Inject the element directly into the jQuery object + this[ 0 ] = elem; + this.length = 1; + } + return this; + } + + // HANDLE: $(expr, $(...)) + } else if ( !context || context.jquery ) { + return ( context || root ).find( selector ); + + // HANDLE: $(expr, context) + // (which is just equivalent to: $(context).find(expr) + } else { + return this.constructor( context ).find( selector ); + } + + // HANDLE: $(DOMElement) + } else if ( selector.nodeType ) { + this[ 0 ] = selector; + this.length = 1; + return this; + + // HANDLE: $(function) + // Shortcut for document ready + } else if ( isFunction( selector ) ) { + return root.ready !== undefined ? 
+ root.ready( selector ) : + + // Execute immediately if ready is not present + selector( jQuery ); + } + + return jQuery.makeArray( selector, this ); + }; + +// Give the init function the jQuery prototype for later instantiation +init.prototype = jQuery.fn; + +// Initialize central reference +rootjQuery = jQuery( document ); + + +var rparentsprev = /^(?:parents|prev(?:Until|All))/, + + // Methods guaranteed to produce a unique set when starting from a unique set + guaranteedUnique = { + children: true, + contents: true, + next: true, + prev: true + }; + +jQuery.fn.extend( { + has: function( target ) { + var targets = jQuery( target, this ), + l = targets.length; + + return this.filter( function() { + var i = 0; + for ( ; i < l; i++ ) { + if ( jQuery.contains( this, targets[ i ] ) ) { + return true; + } + } + } ); + }, + + closest: function( selectors, context ) { + var cur, + i = 0, + l = this.length, + matched = [], + targets = typeof selectors !== "string" && jQuery( selectors ); + + // Positional selectors never match, since there's no _selection_ context + if ( !rneedsContext.test( selectors ) ) { + for ( ; i < l; i++ ) { + for ( cur = this[ i ]; cur && cur !== context; cur = cur.parentNode ) { + + // Always skip document fragments + if ( cur.nodeType < 11 && ( targets ? + targets.index( cur ) > -1 : + + // Don't pass non-elements to Sizzle + cur.nodeType === 1 && + jQuery.find.matchesSelector( cur, selectors ) ) ) { + + matched.push( cur ); + break; + } + } + } + } + + return this.pushStack( matched.length > 1 ? jQuery.uniqueSort( matched ) : matched ); + }, + + // Determine the position of an element within the set + index: function( elem ) { + + // No argument, return index in parent + if ( !elem ) { + return ( this[ 0 ] && this[ 0 ].parentNode ) ? this.first().prevAll().length : -1; + } + + // Index in selector + if ( typeof elem === "string" ) { + return indexOf.call( jQuery( elem ), this[ 0 ] ); + } + + // Locate the position of the desired element + return indexOf.call( this, + + // If it receives a jQuery object, the first element is used + elem.jquery ? elem[ 0 ] : elem + ); + }, + + add: function( selector, context ) { + return this.pushStack( + jQuery.uniqueSort( + jQuery.merge( this.get(), jQuery( selector, context ) ) + ) + ); + }, + + addBack: function( selector ) { + return this.add( selector == null ? + this.prevObject : this.prevObject.filter( selector ) + ); + } +} ); + +function sibling( cur, dir ) { + while ( ( cur = cur[ dir ] ) && cur.nodeType !== 1 ) {} + return cur; +} + +jQuery.each( { + parent: function( elem ) { + var parent = elem.parentNode; + return parent && parent.nodeType !== 11 ? 
parent : null; + }, + parents: function( elem ) { + return dir( elem, "parentNode" ); + }, + parentsUntil: function( elem, _i, until ) { + return dir( elem, "parentNode", until ); + }, + next: function( elem ) { + return sibling( elem, "nextSibling" ); + }, + prev: function( elem ) { + return sibling( elem, "previousSibling" ); + }, + nextAll: function( elem ) { + return dir( elem, "nextSibling" ); + }, + prevAll: function( elem ) { + return dir( elem, "previousSibling" ); + }, + nextUntil: function( elem, _i, until ) { + return dir( elem, "nextSibling", until ); + }, + prevUntil: function( elem, _i, until ) { + return dir( elem, "previousSibling", until ); + }, + siblings: function( elem ) { + return siblings( ( elem.parentNode || {} ).firstChild, elem ); + }, + children: function( elem ) { + return siblings( elem.firstChild ); + }, + contents: function( elem ) { + if ( elem.contentDocument != null && + + // Support: IE 11+ + // elements with no `data` attribute has an object + // `contentDocument` with a `null` prototype. + getProto( elem.contentDocument ) ) { + + return elem.contentDocument; + } + + // Support: IE 9 - 11 only, iOS 7 only, Android Browser <=4.3 only + // Treat the template element as a regular one in browsers that + // don't support it. + if ( nodeName( elem, "template" ) ) { + elem = elem.content || elem; + } + + return jQuery.merge( [], elem.childNodes ); + } +}, function( name, fn ) { + jQuery.fn[ name ] = function( until, selector ) { + var matched = jQuery.map( this, fn, until ); + + if ( name.slice( -5 ) !== "Until" ) { + selector = until; + } + + if ( selector && typeof selector === "string" ) { + matched = jQuery.filter( selector, matched ); + } + + if ( this.length > 1 ) { + + // Remove duplicates + if ( !guaranteedUnique[ name ] ) { + jQuery.uniqueSort( matched ); + } + + // Reverse order for parents* and prev-derivatives + if ( rparentsprev.test( name ) ) { + matched.reverse(); + } + } + + return this.pushStack( matched ); + }; +} ); +var rnothtmlwhite = ( /[^\x20\t\r\n\f]+/g ); + + + +// Convert String-formatted options into Object-formatted ones +function createOptions( options ) { + var object = {}; + jQuery.each( options.match( rnothtmlwhite ) || [], function( _, flag ) { + object[ flag ] = true; + } ); + return object; +} + +/* + * Create a callback list using the following parameters: + * + * options: an optional list of space-separated options that will change how + * the callback list behaves or a more traditional option object + * + * By default a callback list will act like an event callback list and can be + * "fired" multiple times. + * + * Possible options: + * + * once: will ensure the callback list can only be fired once (like a Deferred) + * + * memory: will keep track of previous values and will call any callback added + * after the list has been fired right away with the latest "memorized" + * values (like a Deferred) + * + * unique: will ensure a callback can only be added once (no duplicate in the list) + * + * stopOnFalse: interrupt callings when a callback returns false + * + */ +jQuery.Callbacks = function( options ) { + + // Convert options from String-formatted to Object-formatted if needed + // (we check in cache first) + options = typeof options === "string" ? 
+ createOptions( options ) : + jQuery.extend( {}, options ); + + var // Flag to know if list is currently firing + firing, + + // Last fire value for non-forgettable lists + memory, + + // Flag to know if list was already fired + fired, + + // Flag to prevent firing + locked, + + // Actual callback list + list = [], + + // Queue of execution data for repeatable lists + queue = [], + + // Index of currently firing callback (modified by add/remove as needed) + firingIndex = -1, + + // Fire callbacks + fire = function() { + + // Enforce single-firing + locked = locked || options.once; + + // Execute callbacks for all pending executions, + // respecting firingIndex overrides and runtime changes + fired = firing = true; + for ( ; queue.length; firingIndex = -1 ) { + memory = queue.shift(); + while ( ++firingIndex < list.length ) { + + // Run callback and check for early termination + if ( list[ firingIndex ].apply( memory[ 0 ], memory[ 1 ] ) === false && + options.stopOnFalse ) { + + // Jump to end and forget the data so .add doesn't re-fire + firingIndex = list.length; + memory = false; + } + } + } + + // Forget the data if we're done with it + if ( !options.memory ) { + memory = false; + } + + firing = false; + + // Clean up if we're done firing for good + if ( locked ) { + + // Keep an empty list if we have data for future add calls + if ( memory ) { + list = []; + + // Otherwise, this object is spent + } else { + list = ""; + } + } + }, + + // Actual Callbacks object + self = { + + // Add a callback or a collection of callbacks to the list + add: function() { + if ( list ) { + + // If we have memory from a past run, we should fire after adding + if ( memory && !firing ) { + firingIndex = list.length - 1; + queue.push( memory ); + } + + ( function add( args ) { + jQuery.each( args, function( _, arg ) { + if ( isFunction( arg ) ) { + if ( !options.unique || !self.has( arg ) ) { + list.push( arg ); + } + } else if ( arg && arg.length && toType( arg ) !== "string" ) { + + // Inspect recursively + add( arg ); + } + } ); + } )( arguments ); + + if ( memory && !firing ) { + fire(); + } + } + return this; + }, + + // Remove a callback from the list + remove: function() { + jQuery.each( arguments, function( _, arg ) { + var index; + while ( ( index = jQuery.inArray( arg, list, index ) ) > -1 ) { + list.splice( index, 1 ); + + // Handle firing indexes + if ( index <= firingIndex ) { + firingIndex--; + } + } + } ); + return this; + }, + + // Check if a given callback is in the list. + // If no argument is given, return whether or not list has callbacks attached. + has: function( fn ) { + return fn ? + jQuery.inArray( fn, list ) > -1 : + list.length > 0; + }, + + // Remove all callbacks from the list + empty: function() { + if ( list ) { + list = []; + } + return this; + }, + + // Disable .fire and .add + // Abort any current/pending executions + // Clear all callbacks and values + disable: function() { + locked = queue = []; + list = memory = ""; + return this; + }, + disabled: function() { + return !list; + }, + + // Disable .fire + // Also disable .add unless we have memory (since it would have no effect) + // Abort any pending executions + lock: function() { + locked = queue = []; + if ( !memory && !firing ) { + list = memory = ""; + } + return this; + }, + locked: function() { + return !!locked; + }, + + // Call all callbacks with the given context and arguments + fireWith: function( context, args ) { + if ( !locked ) { + args = args || []; + args = [ context, args.slice ? 
args.slice() : args ]; + queue.push( args ); + if ( !firing ) { + fire(); + } + } + return this; + }, + + // Call all the callbacks with the given arguments + fire: function() { + self.fireWith( this, arguments ); + return this; + }, + + // To know if the callbacks have already been called at least once + fired: function() { + return !!fired; + } + }; + + return self; +}; + + +function Identity( v ) { + return v; +} +function Thrower( ex ) { + throw ex; +} + +function adoptValue( value, resolve, reject, noValue ) { + var method; + + try { + + // Check for promise aspect first to privilege synchronous behavior + if ( value && isFunction( ( method = value.promise ) ) ) { + method.call( value ).done( resolve ).fail( reject ); + + // Other thenables + } else if ( value && isFunction( ( method = value.then ) ) ) { + method.call( value, resolve, reject ); + + // Other non-thenables + } else { + + // Control `resolve` arguments by letting Array#slice cast boolean `noValue` to integer: + // * false: [ value ].slice( 0 ) => resolve( value ) + // * true: [ value ].slice( 1 ) => resolve() + resolve.apply( undefined, [ value ].slice( noValue ) ); + } + + // For Promises/A+, convert exceptions into rejections + // Since jQuery.when doesn't unwrap thenables, we can skip the extra checks appearing in + // Deferred#then to conditionally suppress rejection. + } catch ( value ) { + + // Support: Android 4.0 only + // Strict mode functions invoked without .call/.apply get global-object context + reject.apply( undefined, [ value ] ); + } +} + +jQuery.extend( { + + Deferred: function( func ) { + var tuples = [ + + // action, add listener, callbacks, + // ... .then handlers, argument index, [final state] + [ "notify", "progress", jQuery.Callbacks( "memory" ), + jQuery.Callbacks( "memory" ), 2 ], + [ "resolve", "done", jQuery.Callbacks( "once memory" ), + jQuery.Callbacks( "once memory" ), 0, "resolved" ], + [ "reject", "fail", jQuery.Callbacks( "once memory" ), + jQuery.Callbacks( "once memory" ), 1, "rejected" ] + ], + state = "pending", + promise = { + state: function() { + return state; + }, + always: function() { + deferred.done( arguments ).fail( arguments ); + return this; + }, + "catch": function( fn ) { + return promise.then( null, fn ); + }, + + // Keep pipe for back-compat + pipe: function( /* fnDone, fnFail, fnProgress */ ) { + var fns = arguments; + + return jQuery.Deferred( function( newDefer ) { + jQuery.each( tuples, function( _i, tuple ) { + + // Map tuples (progress, done, fail) to arguments (done, fail, progress) + var fn = isFunction( fns[ tuple[ 4 ] ] ) && fns[ tuple[ 4 ] ]; + + // deferred.progress(function() { bind to newDefer or newDefer.notify }) + // deferred.done(function() { bind to newDefer or newDefer.resolve }) + // deferred.fail(function() { bind to newDefer or newDefer.reject }) + deferred[ tuple[ 1 ] ]( function() { + var returned = fn && fn.apply( this, arguments ); + if ( returned && isFunction( returned.promise ) ) { + returned.promise() + .progress( newDefer.notify ) + .done( newDefer.resolve ) + .fail( newDefer.reject ); + } else { + newDefer[ tuple[ 0 ] + "With" ]( + this, + fn ? 
[ returned ] : arguments + ); + } + } ); + } ); + fns = null; + } ).promise(); + }, + then: function( onFulfilled, onRejected, onProgress ) { + var maxDepth = 0; + function resolve( depth, deferred, handler, special ) { + return function() { + var that = this, + args = arguments, + mightThrow = function() { + var returned, then; + + // Support: Promises/A+ section 2.3.3.3.3 + // https://promisesaplus.com/#point-59 + // Ignore double-resolution attempts + if ( depth < maxDepth ) { + return; + } + + returned = handler.apply( that, args ); + + // Support: Promises/A+ section 2.3.1 + // https://promisesaplus.com/#point-48 + if ( returned === deferred.promise() ) { + throw new TypeError( "Thenable self-resolution" ); + } + + // Support: Promises/A+ sections 2.3.3.1, 3.5 + // https://promisesaplus.com/#point-54 + // https://promisesaplus.com/#point-75 + // Retrieve `then` only once + then = returned && + + // Support: Promises/A+ section 2.3.4 + // https://promisesaplus.com/#point-64 + // Only check objects and functions for thenability + ( typeof returned === "object" || + typeof returned === "function" ) && + returned.then; + + // Handle a returned thenable + if ( isFunction( then ) ) { + + // Special processors (notify) just wait for resolution + if ( special ) { + then.call( + returned, + resolve( maxDepth, deferred, Identity, special ), + resolve( maxDepth, deferred, Thrower, special ) + ); + + // Normal processors (resolve) also hook into progress + } else { + + // ...and disregard older resolution values + maxDepth++; + + then.call( + returned, + resolve( maxDepth, deferred, Identity, special ), + resolve( maxDepth, deferred, Thrower, special ), + resolve( maxDepth, deferred, Identity, + deferred.notifyWith ) + ); + } + + // Handle all other returned values + } else { + + // Only substitute handlers pass on context + // and multiple values (non-spec behavior) + if ( handler !== Identity ) { + that = undefined; + args = [ returned ]; + } + + // Process the value(s) + // Default process is resolve + ( special || deferred.resolveWith )( that, args ); + } + }, + + // Only normal processors (resolve) catch and reject exceptions + process = special ? + mightThrow : + function() { + try { + mightThrow(); + } catch ( e ) { + + if ( jQuery.Deferred.exceptionHook ) { + jQuery.Deferred.exceptionHook( e, + process.stackTrace ); + } + + // Support: Promises/A+ section 2.3.3.3.4.1 + // https://promisesaplus.com/#point-61 + // Ignore post-resolution exceptions + if ( depth + 1 >= maxDepth ) { + + // Only substitute handlers pass on context + // and multiple values (non-spec behavior) + if ( handler !== Thrower ) { + that = undefined; + args = [ e ]; + } + + deferred.rejectWith( that, args ); + } + } + }; + + // Support: Promises/A+ section 2.3.3.3.1 + // https://promisesaplus.com/#point-57 + // Re-resolve promises immediately to dodge false rejection from + // subsequent errors + if ( depth ) { + process(); + } else { + + // Call an optional hook to record the stack, in case of exception + // since it's otherwise lost when execution goes async + if ( jQuery.Deferred.getStackHook ) { + process.stackTrace = jQuery.Deferred.getStackHook(); + } + window.setTimeout( process ); + } + }; + } + + return jQuery.Deferred( function( newDefer ) { + + // progress_handlers.add( ... ) + tuples[ 0 ][ 3 ].add( + resolve( + 0, + newDefer, + isFunction( onProgress ) ? + onProgress : + Identity, + newDefer.notifyWith + ) + ); + + // fulfilled_handlers.add( ... 
) + tuples[ 1 ][ 3 ].add( + resolve( + 0, + newDefer, + isFunction( onFulfilled ) ? + onFulfilled : + Identity + ) + ); + + // rejected_handlers.add( ... ) + tuples[ 2 ][ 3 ].add( + resolve( + 0, + newDefer, + isFunction( onRejected ) ? + onRejected : + Thrower + ) + ); + } ).promise(); + }, + + // Get a promise for this deferred + // If obj is provided, the promise aspect is added to the object + promise: function( obj ) { + return obj != null ? jQuery.extend( obj, promise ) : promise; + } + }, + deferred = {}; + + // Add list-specific methods + jQuery.each( tuples, function( i, tuple ) { + var list = tuple[ 2 ], + stateString = tuple[ 5 ]; + + // promise.progress = list.add + // promise.done = list.add + // promise.fail = list.add + promise[ tuple[ 1 ] ] = list.add; + + // Handle state + if ( stateString ) { + list.add( + function() { + + // state = "resolved" (i.e., fulfilled) + // state = "rejected" + state = stateString; + }, + + // rejected_callbacks.disable + // fulfilled_callbacks.disable + tuples[ 3 - i ][ 2 ].disable, + + // rejected_handlers.disable + // fulfilled_handlers.disable + tuples[ 3 - i ][ 3 ].disable, + + // progress_callbacks.lock + tuples[ 0 ][ 2 ].lock, + + // progress_handlers.lock + tuples[ 0 ][ 3 ].lock + ); + } + + // progress_handlers.fire + // fulfilled_handlers.fire + // rejected_handlers.fire + list.add( tuple[ 3 ].fire ); + + // deferred.notify = function() { deferred.notifyWith(...) } + // deferred.resolve = function() { deferred.resolveWith(...) } + // deferred.reject = function() { deferred.rejectWith(...) } + deferred[ tuple[ 0 ] ] = function() { + deferred[ tuple[ 0 ] + "With" ]( this === deferred ? undefined : this, arguments ); + return this; + }; + + // deferred.notifyWith = list.fireWith + // deferred.resolveWith = list.fireWith + // deferred.rejectWith = list.fireWith + deferred[ tuple[ 0 ] + "With" ] = list.fireWith; + } ); + + // Make the deferred a promise + promise.promise( deferred ); + + // Call given func if any + if ( func ) { + func.call( deferred, deferred ); + } + + // All done! + return deferred; + }, + + // Deferred helper + when: function( singleValue ) { + var + + // count of uncompleted subordinates + remaining = arguments.length, + + // count of unprocessed arguments + i = remaining, + + // subordinate fulfillment data + resolveContexts = Array( i ), + resolveValues = slice.call( arguments ), + + // the master Deferred + master = jQuery.Deferred(), + + // subordinate callback factory + updateFunc = function( i ) { + return function( value ) { + resolveContexts[ i ] = this; + resolveValues[ i ] = arguments.length > 1 ? slice.call( arguments ) : value; + if ( !( --remaining ) ) { + master.resolveWith( resolveContexts, resolveValues ); + } + }; + }; + + // Single- and empty arguments are adopted like Promise.resolve + if ( remaining <= 1 ) { + adoptValue( singleValue, master.done( updateFunc( i ) ).resolve, master.reject, + !remaining ); + + // Use .then() to unwrap secondary thenables (cf. gh-3000) + if ( master.state() === "pending" || + isFunction( resolveValues[ i ] && resolveValues[ i ].then ) ) { + + return master.then(); + } + } + + // Multiple arguments are aggregated like Promise.all array elements + while ( i-- ) { + adoptValue( resolveValues[ i ], updateFunc( i ), master.reject ); + } + + return master.promise(); + } +} ); + + +// These usually indicate a programmer mistake during development, +// warn about them ASAP rather than swallowing them by default. 
+var rerrorNames = /^(Eval|Internal|Range|Reference|Syntax|Type|URI)Error$/; + +jQuery.Deferred.exceptionHook = function( error, stack ) { + + // Support: IE 8 - 9 only + // Console exists when dev tools are open, which can happen at any time + if ( window.console && window.console.warn && error && rerrorNames.test( error.name ) ) { + window.console.warn( "jQuery.Deferred exception: " + error.message, error.stack, stack ); + } +}; + + + + +jQuery.readyException = function( error ) { + window.setTimeout( function() { + throw error; + } ); +}; + + + + +// The deferred used on DOM ready +var readyList = jQuery.Deferred(); + +jQuery.fn.ready = function( fn ) { + + readyList + .then( fn ) + + // Wrap jQuery.readyException in a function so that the lookup + // happens at the time of error handling instead of callback + // registration. + .catch( function( error ) { + jQuery.readyException( error ); + } ); + + return this; +}; + +jQuery.extend( { + + // Is the DOM ready to be used? Set to true once it occurs. + isReady: false, + + // A counter to track how many items to wait for before + // the ready event fires. See #6781 + readyWait: 1, + + // Handle when the DOM is ready + ready: function( wait ) { + + // Abort if there are pending holds or we're already ready + if ( wait === true ? --jQuery.readyWait : jQuery.isReady ) { + return; + } + + // Remember that the DOM is ready + jQuery.isReady = true; + + // If a normal DOM Ready event fired, decrement, and wait if need be + if ( wait !== true && --jQuery.readyWait > 0 ) { + return; + } + + // If there are functions bound, to execute + readyList.resolveWith( document, [ jQuery ] ); + } +} ); + +jQuery.ready.then = readyList.then; + +// The ready event handler and self cleanup method +function completed() { + document.removeEventListener( "DOMContentLoaded", completed ); + window.removeEventListener( "load", completed ); + jQuery.ready(); +} + +// Catch cases where $(document).ready() is called +// after the browser event has already occurred. +// Support: IE <=9 - 10 only +// Older IE sometimes signals "interactive" too soon +if ( document.readyState === "complete" || + ( document.readyState !== "loading" && !document.documentElement.doScroll ) ) { + + // Handle it asynchronously to allow scripts the opportunity to delay ready + window.setTimeout( jQuery.ready ); + +} else { + + // Use the handy event callback + document.addEventListener( "DOMContentLoaded", completed ); + + // A fallback to window.onload, that will always work + window.addEventListener( "load", completed ); +} + + + + +// Multifunctional method to get and set values of a collection +// The value/s can optionally be executed if it's a function +var access = function( elems, fn, key, value, chainable, emptyGet, raw ) { + var i = 0, + len = elems.length, + bulk = key == null; + + // Sets many values + if ( toType( key ) === "object" ) { + chainable = true; + for ( i in key ) { + access( elems, fn, i, key[ i ], true, emptyGet, raw ); + } + + // Sets one value + } else if ( value !== undefined ) { + chainable = true; + + if ( !isFunction( value ) ) { + raw = true; + } + + if ( bulk ) { + + // Bulk operations run against the entire set + if ( raw ) { + fn.call( elems, value ); + fn = null; + + // ...except when executing function values + } else { + bulk = fn; + fn = function( elem, _key, value ) { + return bulk.call( jQuery( elem ), value ); + }; + } + } + + if ( fn ) { + for ( ; i < len; i++ ) { + fn( + elems[ i ], key, raw ? 
+ value : + value.call( elems[ i ], i, fn( elems[ i ], key ) ) + ); + } + } + } + + if ( chainable ) { + return elems; + } + + // Gets + if ( bulk ) { + return fn.call( elems ); + } + + return len ? fn( elems[ 0 ], key ) : emptyGet; +}; + + +// Matches dashed string for camelizing +var rmsPrefix = /^-ms-/, + rdashAlpha = /-([a-z])/g; + +// Used by camelCase as callback to replace() +function fcamelCase( _all, letter ) { + return letter.toUpperCase(); +} + +// Convert dashed to camelCase; used by the css and data modules +// Support: IE <=9 - 11, Edge 12 - 15 +// Microsoft forgot to hump their vendor prefix (#9572) +function camelCase( string ) { + return string.replace( rmsPrefix, "ms-" ).replace( rdashAlpha, fcamelCase ); +} +var acceptData = function( owner ) { + + // Accepts only: + // - Node + // - Node.ELEMENT_NODE + // - Node.DOCUMENT_NODE + // - Object + // - Any + return owner.nodeType === 1 || owner.nodeType === 9 || !( +owner.nodeType ); +}; + + + + +function Data() { + this.expando = jQuery.expando + Data.uid++; +} + +Data.uid = 1; + +Data.prototype = { + + cache: function( owner ) { + + // Check if the owner object already has a cache + var value = owner[ this.expando ]; + + // If not, create one + if ( !value ) { + value = {}; + + // We can accept data for non-element nodes in modern browsers, + // but we should not, see #8335. + // Always return an empty object. + if ( acceptData( owner ) ) { + + // If it is a node unlikely to be stringify-ed or looped over + // use plain assignment + if ( owner.nodeType ) { + owner[ this.expando ] = value; + + // Otherwise secure it in a non-enumerable property + // configurable must be true to allow the property to be + // deleted when data is removed + } else { + Object.defineProperty( owner, this.expando, { + value: value, + configurable: true + } ); + } + } + } + + return value; + }, + set: function( owner, data, value ) { + var prop, + cache = this.cache( owner ); + + // Handle: [ owner, key, value ] args + // Always use camelCase key (gh-2257) + if ( typeof data === "string" ) { + cache[ camelCase( data ) ] = value; + + // Handle: [ owner, { properties } ] args + } else { + + // Copy the properties one-by-one to the cache object + for ( prop in data ) { + cache[ camelCase( prop ) ] = data[ prop ]; + } + } + return cache; + }, + get: function( owner, key ) { + return key === undefined ? + this.cache( owner ) : + + // Always use camelCase key (gh-2257) + owner[ this.expando ] && owner[ this.expando ][ camelCase( key ) ]; + }, + access: function( owner, key, value ) { + + // In cases where either: + // + // 1. No key was specified + // 2. A string key was specified, but no value provided + // + // Take the "read" path and allow the get method to determine + // which value to return, respectively either: + // + // 1. The entire cache object + // 2. The data stored at the key + // + if ( key === undefined || + ( ( key && typeof key === "string" ) && value === undefined ) ) { + + return this.get( owner, key ); + } + + // When the key is not a string, or both a key and value + // are specified, set or extend (existing objects) with either: + // + // 1. An object of properties + // 2. A key and value + // + this.set( owner, key, value ); + + // Since the "set" path can have two possible entry points + // return the expected data based on which path was taken[*] + return value !== undefined ? 
value : key; + }, + remove: function( owner, key ) { + var i, + cache = owner[ this.expando ]; + + if ( cache === undefined ) { + return; + } + + if ( key !== undefined ) { + + // Support array or space separated string of keys + if ( Array.isArray( key ) ) { + + // If key is an array of keys... + // We always set camelCase keys, so remove that. + key = key.map( camelCase ); + } else { + key = camelCase( key ); + + // If a key with the spaces exists, use it. + // Otherwise, create an array by matching non-whitespace + key = key in cache ? + [ key ] : + ( key.match( rnothtmlwhite ) || [] ); + } + + i = key.length; + + while ( i-- ) { + delete cache[ key[ i ] ]; + } + } + + // Remove the expando if there's no more data + if ( key === undefined || jQuery.isEmptyObject( cache ) ) { + + // Support: Chrome <=35 - 45 + // Webkit & Blink performance suffers when deleting properties + // from DOM nodes, so set to undefined instead + // https://bugs.chromium.org/p/chromium/issues/detail?id=378607 (bug restricted) + if ( owner.nodeType ) { + owner[ this.expando ] = undefined; + } else { + delete owner[ this.expando ]; + } + } + }, + hasData: function( owner ) { + var cache = owner[ this.expando ]; + return cache !== undefined && !jQuery.isEmptyObject( cache ); + } +}; +var dataPriv = new Data(); + +var dataUser = new Data(); + + + +// Implementation Summary +// +// 1. Enforce API surface and semantic compatibility with 1.9.x branch +// 2. Improve the module's maintainability by reducing the storage +// paths to a single mechanism. +// 3. Use the same single mechanism to support "private" and "user" data. +// 4. _Never_ expose "private" data to user code (TODO: Drop _data, _removeData) +// 5. Avoid exposing implementation details on user objects (eg. expando properties) +// 6. Provide a clear path for implementation upgrade to WeakMap in 2014 + +var rbrace = /^(?:\{[\w\W]*\}|\[[\w\W]*\])$/, + rmultiDash = /[A-Z]/g; + +function getData( data ) { + if ( data === "true" ) { + return true; + } + + if ( data === "false" ) { + return false; + } + + if ( data === "null" ) { + return null; + } + + // Only convert to a number if it doesn't change the string + if ( data === +data + "" ) { + return +data; + } + + if ( rbrace.test( data ) ) { + return JSON.parse( data ); + } + + return data; +} + +function dataAttr( elem, key, data ) { + var name; + + // If nothing was found internally, try to fetch any + // data from the HTML5 data-* attribute + if ( data === undefined && elem.nodeType === 1 ) { + name = "data-" + key.replace( rmultiDash, "-$&" ).toLowerCase(); + data = elem.getAttribute( name ); + + if ( typeof data === "string" ) { + try { + data = getData( data ); + } catch ( e ) {} + + // Make sure we set the data so it isn't changed later + dataUser.set( elem, key, data ); + } else { + data = undefined; + } + } + return data; +} + +jQuery.extend( { + hasData: function( elem ) { + return dataUser.hasData( elem ) || dataPriv.hasData( elem ); + }, + + data: function( elem, name, data ) { + return dataUser.access( elem, name, data ); + }, + + removeData: function( elem, name ) { + dataUser.remove( elem, name ); + }, + + // TODO: Now that all calls to _data and _removeData have been replaced + // with direct calls to dataPriv methods, these can be deprecated. 
+ _data: function( elem, name, data ) { + return dataPriv.access( elem, name, data ); + }, + + _removeData: function( elem, name ) { + dataPriv.remove( elem, name ); + } +} ); + +jQuery.fn.extend( { + data: function( key, value ) { + var i, name, data, + elem = this[ 0 ], + attrs = elem && elem.attributes; + + // Gets all values + if ( key === undefined ) { + if ( this.length ) { + data = dataUser.get( elem ); + + if ( elem.nodeType === 1 && !dataPriv.get( elem, "hasDataAttrs" ) ) { + i = attrs.length; + while ( i-- ) { + + // Support: IE 11 only + // The attrs elements can be null (#14894) + if ( attrs[ i ] ) { + name = attrs[ i ].name; + if ( name.indexOf( "data-" ) === 0 ) { + name = camelCase( name.slice( 5 ) ); + dataAttr( elem, name, data[ name ] ); + } + } + } + dataPriv.set( elem, "hasDataAttrs", true ); + } + } + + return data; + } + + // Sets multiple values + if ( typeof key === "object" ) { + return this.each( function() { + dataUser.set( this, key ); + } ); + } + + return access( this, function( value ) { + var data; + + // The calling jQuery object (element matches) is not empty + // (and therefore has an element appears at this[ 0 ]) and the + // `value` parameter was not undefined. An empty jQuery object + // will result in `undefined` for elem = this[ 0 ] which will + // throw an exception if an attempt to read a data cache is made. + if ( elem && value === undefined ) { + + // Attempt to get data from the cache + // The key will always be camelCased in Data + data = dataUser.get( elem, key ); + if ( data !== undefined ) { + return data; + } + + // Attempt to "discover" the data in + // HTML5 custom data-* attrs + data = dataAttr( elem, key ); + if ( data !== undefined ) { + return data; + } + + // We tried really hard, but the data doesn't exist. + return; + } + + // Set the data... 
+ this.each( function() { + + // We always store the camelCased key + dataUser.set( this, key, value ); + } ); + }, null, value, arguments.length > 1, null, true ); + }, + + removeData: function( key ) { + return this.each( function() { + dataUser.remove( this, key ); + } ); + } +} ); + + +jQuery.extend( { + queue: function( elem, type, data ) { + var queue; + + if ( elem ) { + type = ( type || "fx" ) + "queue"; + queue = dataPriv.get( elem, type ); + + // Speed up dequeue by getting out quickly if this is just a lookup + if ( data ) { + if ( !queue || Array.isArray( data ) ) { + queue = dataPriv.access( elem, type, jQuery.makeArray( data ) ); + } else { + queue.push( data ); + } + } + return queue || []; + } + }, + + dequeue: function( elem, type ) { + type = type || "fx"; + + var queue = jQuery.queue( elem, type ), + startLength = queue.length, + fn = queue.shift(), + hooks = jQuery._queueHooks( elem, type ), + next = function() { + jQuery.dequeue( elem, type ); + }; + + // If the fx queue is dequeued, always remove the progress sentinel + if ( fn === "inprogress" ) { + fn = queue.shift(); + startLength--; + } + + if ( fn ) { + + // Add a progress sentinel to prevent the fx queue from being + // automatically dequeued + if ( type === "fx" ) { + queue.unshift( "inprogress" ); + } + + // Clear up the last queue stop function + delete hooks.stop; + fn.call( elem, next, hooks ); + } + + if ( !startLength && hooks ) { + hooks.empty.fire(); + } + }, + + // Not public - generate a queueHooks object, or return the current one + _queueHooks: function( elem, type ) { + var key = type + "queueHooks"; + return dataPriv.get( elem, key ) || dataPriv.access( elem, key, { + empty: jQuery.Callbacks( "once memory" ).add( function() { + dataPriv.remove( elem, [ type + "queue", key ] ); + } ) + } ); + } +} ); + +jQuery.fn.extend( { + queue: function( type, data ) { + var setter = 2; + + if ( typeof type !== "string" ) { + data = type; + type = "fx"; + setter--; + } + + if ( arguments.length < setter ) { + return jQuery.queue( this[ 0 ], type ); + } + + return data === undefined ? 
+ this : + this.each( function() { + var queue = jQuery.queue( this, type, data ); + + // Ensure a hooks for this queue + jQuery._queueHooks( this, type ); + + if ( type === "fx" && queue[ 0 ] !== "inprogress" ) { + jQuery.dequeue( this, type ); + } + } ); + }, + dequeue: function( type ) { + return this.each( function() { + jQuery.dequeue( this, type ); + } ); + }, + clearQueue: function( type ) { + return this.queue( type || "fx", [] ); + }, + + // Get a promise resolved when queues of a certain type + // are emptied (fx is the type by default) + promise: function( type, obj ) { + var tmp, + count = 1, + defer = jQuery.Deferred(), + elements = this, + i = this.length, + resolve = function() { + if ( !( --count ) ) { + defer.resolveWith( elements, [ elements ] ); + } + }; + + if ( typeof type !== "string" ) { + obj = type; + type = undefined; + } + type = type || "fx"; + + while ( i-- ) { + tmp = dataPriv.get( elements[ i ], type + "queueHooks" ); + if ( tmp && tmp.empty ) { + count++; + tmp.empty.add( resolve ); + } + } + resolve(); + return defer.promise( obj ); + } +} ); +var pnum = ( /[+-]?(?:\d*\.|)\d+(?:[eE][+-]?\d+|)/ ).source; + +var rcssNum = new RegExp( "^(?:([+-])=|)(" + pnum + ")([a-z%]*)$", "i" ); + + +var cssExpand = [ "Top", "Right", "Bottom", "Left" ]; + +var documentElement = document.documentElement; + + + + var isAttached = function( elem ) { + return jQuery.contains( elem.ownerDocument, elem ); + }, + composed = { composed: true }; + + // Support: IE 9 - 11+, Edge 12 - 18+, iOS 10.0 - 10.2 only + // Check attachment across shadow DOM boundaries when possible (gh-3504) + // Support: iOS 10.0-10.2 only + // Early iOS 10 versions support `attachShadow` but not `getRootNode`, + // leading to errors. We need to check for `getRootNode`. + if ( documentElement.getRootNode ) { + isAttached = function( elem ) { + return jQuery.contains( elem.ownerDocument, elem ) || + elem.getRootNode( composed ) === elem.ownerDocument; + }; + } +var isHiddenWithinTree = function( elem, el ) { + + // isHiddenWithinTree might be called from jQuery#filter function; + // in that case, element will be second argument + elem = el || elem; + + // Inline style trumps all + return elem.style.display === "none" || + elem.style.display === "" && + + // Otherwise, check computed style + // Support: Firefox <=43 - 45 + // Disconnected elements can have computed display: none, so first confirm that elem is + // in the document. + isAttached( elem ) && + + jQuery.css( elem, "display" ) === "none"; + }; + + + +function adjustCSS( elem, prop, valueParts, tween ) { + var adjusted, scale, + maxIterations = 20, + currentValue = tween ? + function() { + return tween.cur(); + } : + function() { + return jQuery.css( elem, prop, "" ); + }, + initial = currentValue(), + unit = valueParts && valueParts[ 3 ] || ( jQuery.cssNumber[ prop ] ? 
"" : "px" ), + + // Starting value computation is required for potential unit mismatches + initialInUnit = elem.nodeType && + ( jQuery.cssNumber[ prop ] || unit !== "px" && +initial ) && + rcssNum.exec( jQuery.css( elem, prop ) ); + + if ( initialInUnit && initialInUnit[ 3 ] !== unit ) { + + // Support: Firefox <=54 + // Halve the iteration target value to prevent interference from CSS upper bounds (gh-2144) + initial = initial / 2; + + // Trust units reported by jQuery.css + unit = unit || initialInUnit[ 3 ]; + + // Iteratively approximate from a nonzero starting point + initialInUnit = +initial || 1; + + while ( maxIterations-- ) { + + // Evaluate and update our best guess (doubling guesses that zero out). + // Finish if the scale equals or crosses 1 (making the old*new product non-positive). + jQuery.style( elem, prop, initialInUnit + unit ); + if ( ( 1 - scale ) * ( 1 - ( scale = currentValue() / initial || 0.5 ) ) <= 0 ) { + maxIterations = 0; + } + initialInUnit = initialInUnit / scale; + + } + + initialInUnit = initialInUnit * 2; + jQuery.style( elem, prop, initialInUnit + unit ); + + // Make sure we update the tween properties later on + valueParts = valueParts || []; + } + + if ( valueParts ) { + initialInUnit = +initialInUnit || +initial || 0; + + // Apply relative offset (+=/-=) if specified + adjusted = valueParts[ 1 ] ? + initialInUnit + ( valueParts[ 1 ] + 1 ) * valueParts[ 2 ] : + +valueParts[ 2 ]; + if ( tween ) { + tween.unit = unit; + tween.start = initialInUnit; + tween.end = adjusted; + } + } + return adjusted; +} + + +var defaultDisplayMap = {}; + +function getDefaultDisplay( elem ) { + var temp, + doc = elem.ownerDocument, + nodeName = elem.nodeName, + display = defaultDisplayMap[ nodeName ]; + + if ( display ) { + return display; + } + + temp = doc.body.appendChild( doc.createElement( nodeName ) ); + display = jQuery.css( temp, "display" ); + + temp.parentNode.removeChild( temp ); + + if ( display === "none" ) { + display = "block"; + } + defaultDisplayMap[ nodeName ] = display; + + return display; +} + +function showHide( elements, show ) { + var display, elem, + values = [], + index = 0, + length = elements.length; + + // Determine new display value for elements that need to change + for ( ; index < length; index++ ) { + elem = elements[ index ]; + if ( !elem.style ) { + continue; + } + + display = elem.style.display; + if ( show ) { + + // Since we force visibility upon cascade-hidden elements, an immediate (and slow) + // check is required in this first loop unless we have a nonempty display value (either + // inline or about-to-be-restored) + if ( display === "none" ) { + values[ index ] = dataPriv.get( elem, "display" ) || null; + if ( !values[ index ] ) { + elem.style.display = ""; + } + } + if ( elem.style.display === "" && isHiddenWithinTree( elem ) ) { + values[ index ] = getDefaultDisplay( elem ); + } + } else { + if ( display !== "none" ) { + values[ index ] = "none"; + + // Remember what we're overwriting + dataPriv.set( elem, "display", display ); + } + } + } + + // Set the display of the elements in a second loop to avoid constant reflow + for ( index = 0; index < length; index++ ) { + if ( values[ index ] != null ) { + elements[ index ].style.display = values[ index ]; + } + } + + return elements; +} + +jQuery.fn.extend( { + show: function() { + return showHide( this, true ); + }, + hide: function() { + return showHide( this ); + }, + toggle: function( state ) { + if ( typeof state === "boolean" ) { + return state ? 
this.show() : this.hide(); + } + + return this.each( function() { + if ( isHiddenWithinTree( this ) ) { + jQuery( this ).show(); + } else { + jQuery( this ).hide(); + } + } ); + } +} ); +var rcheckableType = ( /^(?:checkbox|radio)$/i ); + +var rtagName = ( /<([a-z][^\/\0>\x20\t\r\n\f]*)/i ); + +var rscriptType = ( /^$|^module$|\/(?:java|ecma)script/i ); + + + +( function() { + var fragment = document.createDocumentFragment(), + div = fragment.appendChild( document.createElement( "div" ) ), + input = document.createElement( "input" ); + + // Support: Android 4.0 - 4.3 only + // Check state lost if the name is set (#11217) + // Support: Windows Web Apps (WWA) + // `name` and `type` must use .setAttribute for WWA (#14901) + input.setAttribute( "type", "radio" ); + input.setAttribute( "checked", "checked" ); + input.setAttribute( "name", "t" ); + + div.appendChild( input ); + + // Support: Android <=4.1 only + // Older WebKit doesn't clone checked state correctly in fragments + support.checkClone = div.cloneNode( true ).cloneNode( true ).lastChild.checked; + + // Support: IE <=11 only + // Make sure textarea (and checkbox) defaultValue is properly cloned + div.innerHTML = "<textarea>x</textarea>"; + support.noCloneChecked = !!div.cloneNode( true ).lastChild.defaultValue; + + // Support: IE <=9 only + // IE <=9 replaces <option> elements with the correct value in the fragment + div.innerHTML = "<option></option>"; + support.option = !!div.lastChild; +} )();
+ + +// We have to close these tags to support XHTML (#13200) +var wrapMap = { + + // XHTML parsers do not magically insert elements in the + // same way that tag soup parsers do. So we cannot shorten + // this by omitting <tbody> or other required elements. + thead: [ 1, "<table>", "</table>" ], + col: [ 2, "<table><colgroup>", "</colgroup></table>" ], + tr: [ 2, "<table><tbody>", "</tbody></table>" ], + td: [ 3, "<table><tbody><tr>", "</tr></tbody></table>" ], + + _default: [ 0, "", "" ] +}; + +wrapMap.tbody = wrapMap.tfoot = wrapMap.colgroup = wrapMap.caption = wrapMap.thead; +wrapMap.th = wrapMap.td; + +// Support: IE <=9 only +if ( !support.option ) { + wrapMap.optgroup = wrapMap.option = [ 1, "<select multiple='multiple'>", "</select>" ]; +}
+ + +function getAll( context, tag ) { + + // Support: IE <=9 - 11 only + // Use typeof to avoid zero-argument method invocation on host objects (#15151) + var ret; + + if ( typeof context.getElementsByTagName !== "undefined" ) { + ret = context.getElementsByTagName( tag || "*" ); + + } else if ( typeof context.querySelectorAll !== "undefined" ) { + ret = context.querySelectorAll( tag || "*" ); + + } else { + ret = []; + } + + if ( tag === undefined || tag && nodeName( context, tag ) ) { + return jQuery.merge( [ context ], ret ); + } + + return ret; +} + + +// Mark scripts as having already been evaluated +function setGlobalEval( elems, refElements ) { + var i = 0, + l = elems.length; + + for ( ; i < l; i++ ) { + dataPriv.set( + elems[ i ], + "globalEval", + !refElements || dataPriv.get( refElements[ i ], "globalEval" ) + ); + } +} + + +var rhtml = /<|&#?\w+;/;
+ +function buildFragment( elems, context, scripts, selection, ignored ) { + var elem, tmp, tag, wrap, attached, j, + fragment = context.createDocumentFragment(), + nodes = [], + i = 0, + l = elems.length; + + for ( ; i < l; i++ ) { + elem = elems[ i ]; + + if ( elem || elem === 0 ) { + + // Add nodes directly + if ( toType( elem ) === "object" ) { + + // Support: Android <=4.0 only, PhantomJS 1 only + // push.apply(_, arraylike) throws on ancient WebKit + jQuery.merge( nodes, elem.nodeType ? [ elem ] : elem ); + + // Convert non-html into a text node + } else if ( !rhtml.test( elem ) ) { + nodes.push( context.createTextNode( elem ) ); + + // Convert html into DOM nodes + } else { + tmp = tmp || fragment.appendChild( context.createElement( "div" ) ); + + // Deserialize a standard representation + tag = ( rtagName.exec( elem ) || [ "", "" ] )[ 1 ].toLowerCase(); + wrap = wrapMap[ tag ] || wrapMap._default; + tmp.innerHTML = wrap[ 1 ] + jQuery.htmlPrefilter( elem ) + wrap[ 2 ]; + + // Descend through wrappers to the right content + j = wrap[ 0 ]; + while ( j-- ) { + tmp = tmp.lastChild; + } + + // Support: Android <=4.0 only, PhantomJS 1 only + // push.apply(_, arraylike) throws on ancient WebKit + jQuery.merge( nodes, tmp.childNodes ); + + // Remember the top-level container + tmp = fragment.firstChild; + + // Ensure the created nodes are orphaned (#12392) + tmp.textContent = ""; + } + } + } + + // Remove wrapper from fragment + fragment.textContent = ""; + + i = 0; + while ( ( elem = nodes[ i++ ] ) ) { + + // Skip elements already in the context collection (trac-4087) + if ( selection && jQuery.inArray( elem, selection ) > -1 ) { + if ( ignored ) { + ignored.push( elem ); + } + continue; + } + + attached = isAttached( elem ); + + // Append to fragment + tmp = getAll( fragment.appendChild( elem ), "script" ); + + // Preserve script evaluation history + if ( attached ) { + setGlobalEval( tmp ); + } + + // Capture executables + if ( scripts ) { + j = 0; + while ( ( elem = tmp[ j++ ] ) ) { + if ( rscriptType.test( elem.type || "" ) ) { + scripts.push( elem ); + } + } + } + } + + return fragment; +}
+ + +var + rkeyEvent = /^key/, + rmouseEvent = /^(?:mouse|pointer|contextmenu|drag|drop)|click/, + rtypenamespace = /^([^.]*)(?:\.(.+)|)/; + +function returnTrue() { + return true; +} + +function returnFalse() { + return false; +} + +// Support: IE <=9 - 11+ +// focus() and blur() are 
asynchronous, except when they are no-op. +// So expect focus to be synchronous when the element is already active, +// and blur to be synchronous when the element is not already active. +// (focus and blur are always synchronous in other supported browsers, +// this just defines when we can count on it). +function expectSync( elem, type ) { + return ( elem === safeActiveElement() ) === ( type === "focus" ); +} + +// Support: IE <=9 only +// Accessing document.activeElement can throw unexpectedly +// https://bugs.jquery.com/ticket/13393 +function safeActiveElement() { + try { + return document.activeElement; + } catch ( err ) { } +} + +function on( elem, types, selector, data, fn, one ) { + var origFn, type; + + // Types can be a map of types/handlers + if ( typeof types === "object" ) { + + // ( types-Object, selector, data ) + if ( typeof selector !== "string" ) { + + // ( types-Object, data ) + data = data || selector; + selector = undefined; + } + for ( type in types ) { + on( elem, type, selector, data, types[ type ], one ); + } + return elem; + } + + if ( data == null && fn == null ) { + + // ( types, fn ) + fn = selector; + data = selector = undefined; + } else if ( fn == null ) { + if ( typeof selector === "string" ) { + + // ( types, selector, fn ) + fn = data; + data = undefined; + } else { + + // ( types, data, fn ) + fn = data; + data = selector; + selector = undefined; + } + } + if ( fn === false ) { + fn = returnFalse; + } else if ( !fn ) { + return elem; + } + + if ( one === 1 ) { + origFn = fn; + fn = function( event ) { + + // Can use an empty set, since event contains the info + jQuery().off( event ); + return origFn.apply( this, arguments ); + }; + + // Use same guid so caller can remove using origFn + fn.guid = origFn.guid || ( origFn.guid = jQuery.guid++ ); + } + return elem.each( function() { + jQuery.event.add( this, types, fn, data, selector ); + } ); +} + +/* + * Helper functions for managing events -- not part of the public interface. + * Props to Dean Edwards' addEvent library for many of the ideas. + */ +jQuery.event = { + + global: {}, + + add: function( elem, types, handler, data, selector ) { + + var handleObjIn, eventHandle, tmp, + events, t, handleObj, + special, handlers, type, namespaces, origType, + elemData = dataPriv.get( elem ); + + // Only attach events to objects that accept data + if ( !acceptData( elem ) ) { + return; + } + + // Caller can pass in an object of custom data in lieu of the handler + if ( handler.handler ) { + handleObjIn = handler; + handler = handleObjIn.handler; + selector = handleObjIn.selector; + } + + // Ensure that invalid selectors throw exceptions at attach time + // Evaluate against documentElement in case elem is a non-element node (e.g., document) + if ( selector ) { + jQuery.find.matchesSelector( documentElement, selector ); + } + + // Make sure that the handler has a unique ID, used to find/remove it later + if ( !handler.guid ) { + handler.guid = jQuery.guid++; + } + + // Init the element's event structure and main handler, if this is the first + if ( !( events = elemData.events ) ) { + events = elemData.events = Object.create( null ); + } + if ( !( eventHandle = elemData.handle ) ) { + eventHandle = elemData.handle = function( e ) { + + // Discard the second event of a jQuery.event.trigger() and + // when an event is called after a page has unloaded + return typeof jQuery !== "undefined" && jQuery.event.triggered !== e.type ? 
+ jQuery.event.dispatch.apply( elem, arguments ) : undefined; + }; + } + + // Handle multiple events separated by a space + types = ( types || "" ).match( rnothtmlwhite ) || [ "" ]; + t = types.length; + while ( t-- ) { + tmp = rtypenamespace.exec( types[ t ] ) || []; + type = origType = tmp[ 1 ]; + namespaces = ( tmp[ 2 ] || "" ).split( "." ).sort(); + + // There *must* be a type, no attaching namespace-only handlers + if ( !type ) { + continue; + } + + // If event changes its type, use the special event handlers for the changed type + special = jQuery.event.special[ type ] || {}; + + // If selector defined, determine special event api type, otherwise given type + type = ( selector ? special.delegateType : special.bindType ) || type; + + // Update special based on newly reset type + special = jQuery.event.special[ type ] || {}; + + // handleObj is passed to all event handlers + handleObj = jQuery.extend( { + type: type, + origType: origType, + data: data, + handler: handler, + guid: handler.guid, + selector: selector, + needsContext: selector && jQuery.expr.match.needsContext.test( selector ), + namespace: namespaces.join( "." ) + }, handleObjIn ); + + // Init the event handler queue if we're the first + if ( !( handlers = events[ type ] ) ) { + handlers = events[ type ] = []; + handlers.delegateCount = 0; + + // Only use addEventListener if the special events handler returns false + if ( !special.setup || + special.setup.call( elem, data, namespaces, eventHandle ) === false ) { + + if ( elem.addEventListener ) { + elem.addEventListener( type, eventHandle ); + } + } + } + + if ( special.add ) { + special.add.call( elem, handleObj ); + + if ( !handleObj.handler.guid ) { + handleObj.handler.guid = handler.guid; + } + } + + // Add to the element's handler list, delegates in front + if ( selector ) { + handlers.splice( handlers.delegateCount++, 0, handleObj ); + } else { + handlers.push( handleObj ); + } + + // Keep track of which events have ever been used, for event optimization + jQuery.event.global[ type ] = true; + } + + }, + + // Detach an event or set of events from an element + remove: function( elem, types, handler, selector, mappedTypes ) { + + var j, origCount, tmp, + events, t, handleObj, + special, handlers, type, namespaces, origType, + elemData = dataPriv.hasData( elem ) && dataPriv.get( elem ); + + if ( !elemData || !( events = elemData.events ) ) { + return; + } + + // Once for each type.namespace in types; type may be omitted + types = ( types || "" ).match( rnothtmlwhite ) || [ "" ]; + t = types.length; + while ( t-- ) { + tmp = rtypenamespace.exec( types[ t ] ) || []; + type = origType = tmp[ 1 ]; + namespaces = ( tmp[ 2 ] || "" ).split( "." ).sort(); + + // Unbind all events (on this namespace, if provided) for the element + if ( !type ) { + for ( type in events ) { + jQuery.event.remove( elem, type + types[ t ], handler, selector, true ); + } + continue; + } + + special = jQuery.event.special[ type ] || {}; + type = ( selector ? 
special.delegateType : special.bindType ) || type; + handlers = events[ type ] || []; + tmp = tmp[ 2 ] && + new RegExp( "(^|\\.)" + namespaces.join( "\\.(?:.*\\.|)" ) + "(\\.|$)" ); + + // Remove matching events + origCount = j = handlers.length; + while ( j-- ) { + handleObj = handlers[ j ]; + + if ( ( mappedTypes || origType === handleObj.origType ) && + ( !handler || handler.guid === handleObj.guid ) && + ( !tmp || tmp.test( handleObj.namespace ) ) && + ( !selector || selector === handleObj.selector || + selector === "**" && handleObj.selector ) ) { + handlers.splice( j, 1 ); + + if ( handleObj.selector ) { + handlers.delegateCount--; + } + if ( special.remove ) { + special.remove.call( elem, handleObj ); + } + } + } + + // Remove generic event handler if we removed something and no more handlers exist + // (avoids potential for endless recursion during removal of special event handlers) + if ( origCount && !handlers.length ) { + if ( !special.teardown || + special.teardown.call( elem, namespaces, elemData.handle ) === false ) { + + jQuery.removeEvent( elem, type, elemData.handle ); + } + + delete events[ type ]; + } + } + + // Remove data and the expando if it's no longer used + if ( jQuery.isEmptyObject( events ) ) { + dataPriv.remove( elem, "handle events" ); + } + }, + + dispatch: function( nativeEvent ) { + + var i, j, ret, matched, handleObj, handlerQueue, + args = new Array( arguments.length ), + + // Make a writable jQuery.Event from the native event object + event = jQuery.event.fix( nativeEvent ), + + handlers = ( + dataPriv.get( this, "events" ) || Object.create( null ) + )[ event.type ] || [], + special = jQuery.event.special[ event.type ] || {}; + + // Use the fix-ed jQuery.Event rather than the (read-only) native event + args[ 0 ] = event; + + for ( i = 1; i < arguments.length; i++ ) { + args[ i ] = arguments[ i ]; + } + + event.delegateTarget = this; + + // Call the preDispatch hook for the mapped type, and let it bail if desired + if ( special.preDispatch && special.preDispatch.call( this, event ) === false ) { + return; + } + + // Determine handlers + handlerQueue = jQuery.event.handlers.call( this, event, handlers ); + + // Run delegates first; they may want to stop propagation beneath us + i = 0; + while ( ( matched = handlerQueue[ i++ ] ) && !event.isPropagationStopped() ) { + event.currentTarget = matched.elem; + + j = 0; + while ( ( handleObj = matched.handlers[ j++ ] ) && + !event.isImmediatePropagationStopped() ) { + + // If the event is namespaced, then each handler is only invoked if it is + // specially universal or its namespaces are a superset of the event's. 
+ if ( !event.rnamespace || handleObj.namespace === false || + event.rnamespace.test( handleObj.namespace ) ) { + + event.handleObj = handleObj; + event.data = handleObj.data; + + ret = ( ( jQuery.event.special[ handleObj.origType ] || {} ).handle || + handleObj.handler ).apply( matched.elem, args ); + + if ( ret !== undefined ) { + if ( ( event.result = ret ) === false ) { + event.preventDefault(); + event.stopPropagation(); + } + } + } + } + } + + // Call the postDispatch hook for the mapped type + if ( special.postDispatch ) { + special.postDispatch.call( this, event ); + } + + return event.result; + }, + + handlers: function( event, handlers ) { + var i, handleObj, sel, matchedHandlers, matchedSelectors, + handlerQueue = [], + delegateCount = handlers.delegateCount, + cur = event.target; + + // Find delegate handlers + if ( delegateCount && + + // Support: IE <=9 + // Black-hole SVG instance trees (trac-13180) + cur.nodeType && + + // Support: Firefox <=42 + // Suppress spec-violating clicks indicating a non-primary pointer button (trac-3861) + // https://www.w3.org/TR/DOM-Level-3-Events/#event-type-click + // Support: IE 11 only + // ...but not arrow key "clicks" of radio inputs, which can have `button` -1 (gh-2343) + !( event.type === "click" && event.button >= 1 ) ) { + + for ( ; cur !== this; cur = cur.parentNode || this ) { + + // Don't check non-elements (#13208) + // Don't process clicks on disabled elements (#6911, #8165, #11382, #11764) + if ( cur.nodeType === 1 && !( event.type === "click" && cur.disabled === true ) ) { + matchedHandlers = []; + matchedSelectors = {}; + for ( i = 0; i < delegateCount; i++ ) { + handleObj = handlers[ i ]; + + // Don't conflict with Object.prototype properties (#13203) + sel = handleObj.selector + " "; + + if ( matchedSelectors[ sel ] === undefined ) { + matchedSelectors[ sel ] = handleObj.needsContext ? + jQuery( sel, this ).index( cur ) > -1 : + jQuery.find( sel, this, null, [ cur ] ).length; + } + if ( matchedSelectors[ sel ] ) { + matchedHandlers.push( handleObj ); + } + } + if ( matchedHandlers.length ) { + handlerQueue.push( { elem: cur, handlers: matchedHandlers } ); + } + } + } + } + + // Add the remaining (directly-bound) handlers + cur = this; + if ( delegateCount < handlers.length ) { + handlerQueue.push( { elem: cur, handlers: handlers.slice( delegateCount ) } ); + } + + return handlerQueue; + }, + + addProp: function( name, hook ) { + Object.defineProperty( jQuery.Event.prototype, name, { + enumerable: true, + configurable: true, + + get: isFunction( hook ) ? + function() { + if ( this.originalEvent ) { + return hook( this.originalEvent ); + } + } : + function() { + if ( this.originalEvent ) { + return this.originalEvent[ name ]; + } + }, + + set: function( value ) { + Object.defineProperty( this, name, { + enumerable: true, + configurable: true, + writable: true, + value: value + } ); + } + } ); + }, + + fix: function( originalEvent ) { + return originalEvent[ jQuery.expando ] ? + originalEvent : + new jQuery.Event( originalEvent ); + }, + + special: { + load: { + + // Prevent triggered image.load events from bubbling to window.load + noBubble: true + }, + click: { + + // Utilize native event to ensure correct state for checkable inputs + setup: function( data ) { + + // For mutual compressibility with _default, replace `this` access with a local var. + // `|| data` is dead code meant only to preserve the variable through minification. 
+ var el = this || data; + + // Claim the first handler + if ( rcheckableType.test( el.type ) && + el.click && nodeName( el, "input" ) ) { + + // dataPriv.set( el, "click", ... ) + leverageNative( el, "click", returnTrue ); + } + + // Return false to allow normal processing in the caller + return false; + }, + trigger: function( data ) { + + // For mutual compressibility with _default, replace `this` access with a local var. + // `|| data` is dead code meant only to preserve the variable through minification. + var el = this || data; + + // Force setup before triggering a click + if ( rcheckableType.test( el.type ) && + el.click && nodeName( el, "input" ) ) { + + leverageNative( el, "click" ); + } + + // Return non-false to allow normal event-path propagation + return true; + }, + + // For cross-browser consistency, suppress native .click() on links + // Also prevent it if we're currently inside a leveraged native-event stack + _default: function( event ) { + var target = event.target; + return rcheckableType.test( target.type ) && + target.click && nodeName( target, "input" ) && + dataPriv.get( target, "click" ) || + nodeName( target, "a" ); + } + }, + + beforeunload: { + postDispatch: function( event ) { + + // Support: Firefox 20+ + // Firefox doesn't alert if the returnValue field is not set. + if ( event.result !== undefined && event.originalEvent ) { + event.originalEvent.returnValue = event.result; + } + } + } + } +}; + +// Ensure the presence of an event listener that handles manually-triggered +// synthetic events by interrupting progress until reinvoked in response to +// *native* events that it fires directly, ensuring that state changes have +// already occurred before other listeners are invoked. +function leverageNative( el, type, expectSync ) { + + // Missing expectSync indicates a trigger call, which must force setup through jQuery.event.add + if ( !expectSync ) { + if ( dataPriv.get( el, type ) === undefined ) { + jQuery.event.add( el, type, returnTrue ); + } + return; + } + + // Register the controller as a special universal handler for all event namespaces + dataPriv.set( el, type, false ); + jQuery.event.add( el, type, { + namespace: false, + handler: function( event ) { + var notAsync, result, + saved = dataPriv.get( this, type ); + + if ( ( event.isTrigger & 1 ) && this[ type ] ) { + + // Interrupt processing of the outer synthetic .trigger()ed event + // Saved data should be false in such cases, but might be a leftover capture object + // from an async native handler (gh-4350) + if ( !saved.length ) { + + // Store arguments for use when handling the inner native event + // There will always be at least one argument (an event object), so this array + // will not be confused with a leftover capture object. 
+ saved = slice.call( arguments ); + dataPriv.set( this, type, saved ); + + // Trigger the native event and capture its result + // Support: IE <=9 - 11+ + // focus() and blur() are asynchronous + notAsync = expectSync( this, type ); + this[ type ](); + result = dataPriv.get( this, type ); + if ( saved !== result || notAsync ) { + dataPriv.set( this, type, false ); + } else { + result = {}; + } + if ( saved !== result ) { + + // Cancel the outer synthetic event + event.stopImmediatePropagation(); + event.preventDefault(); + return result.value; + } + + // If this is an inner synthetic event for an event with a bubbling surrogate + // (focus or blur), assume that the surrogate already propagated from triggering the + // native event and prevent that from happening again here. + // This technically gets the ordering wrong w.r.t. to `.trigger()` (in which the + // bubbling surrogate propagates *after* the non-bubbling base), but that seems + // less bad than duplication. + } else if ( ( jQuery.event.special[ type ] || {} ).delegateType ) { + event.stopPropagation(); + } + + // If this is a native event triggered above, everything is now in order + // Fire an inner synthetic event with the original arguments + } else if ( saved.length ) { + + // ...and capture the result + dataPriv.set( this, type, { + value: jQuery.event.trigger( + + // Support: IE <=9 - 11+ + // Extend with the prototype to reset the above stopImmediatePropagation() + jQuery.extend( saved[ 0 ], jQuery.Event.prototype ), + saved.slice( 1 ), + this + ) + } ); + + // Abort handling of the native event + event.stopImmediatePropagation(); + } + } + } ); +} + +jQuery.removeEvent = function( elem, type, handle ) { + + // This "if" is needed for plain objects + if ( elem.removeEventListener ) { + elem.removeEventListener( type, handle ); + } +}; + +jQuery.Event = function( src, props ) { + + // Allow instantiation without the 'new' keyword + if ( !( this instanceof jQuery.Event ) ) { + return new jQuery.Event( src, props ); + } + + // Event object + if ( src && src.type ) { + this.originalEvent = src; + this.type = src.type; + + // Events bubbling up the document may have been marked as prevented + // by a handler lower down the tree; reflect the correct value. + this.isDefaultPrevented = src.defaultPrevented || + src.defaultPrevented === undefined && + + // Support: Android <=2.3 only + src.returnValue === false ? + returnTrue : + returnFalse; + + // Create target properties + // Support: Safari <=6 - 7 only + // Target should not be a text node (#504, #13143) + this.target = ( src.target && src.target.nodeType === 3 ) ? 
+ src.target.parentNode : + src.target; + + this.currentTarget = src.currentTarget; + this.relatedTarget = src.relatedTarget; + + // Event type + } else { + this.type = src; + } + + // Put explicitly provided properties onto the event object + if ( props ) { + jQuery.extend( this, props ); + } + + // Create a timestamp if incoming event doesn't have one + this.timeStamp = src && src.timeStamp || Date.now(); + + // Mark it as fixed + this[ jQuery.expando ] = true; +}; + +// jQuery.Event is based on DOM3 Events as specified by the ECMAScript Language Binding +// https://www.w3.org/TR/2003/WD-DOM-Level-3-Events-20030331/ecma-script-binding.html +jQuery.Event.prototype = { + constructor: jQuery.Event, + isDefaultPrevented: returnFalse, + isPropagationStopped: returnFalse, + isImmediatePropagationStopped: returnFalse, + isSimulated: false, + + preventDefault: function() { + var e = this.originalEvent; + + this.isDefaultPrevented = returnTrue; + + if ( e && !this.isSimulated ) { + e.preventDefault(); + } + }, + stopPropagation: function() { + var e = this.originalEvent; + + this.isPropagationStopped = returnTrue; + + if ( e && !this.isSimulated ) { + e.stopPropagation(); + } + }, + stopImmediatePropagation: function() { + var e = this.originalEvent; + + this.isImmediatePropagationStopped = returnTrue; + + if ( e && !this.isSimulated ) { + e.stopImmediatePropagation(); + } + + this.stopPropagation(); + } +}; + +// Includes all common event props including KeyEvent and MouseEvent specific props +jQuery.each( { + altKey: true, + bubbles: true, + cancelable: true, + changedTouches: true, + ctrlKey: true, + detail: true, + eventPhase: true, + metaKey: true, + pageX: true, + pageY: true, + shiftKey: true, + view: true, + "char": true, + code: true, + charCode: true, + key: true, + keyCode: true, + button: true, + buttons: true, + clientX: true, + clientY: true, + offsetX: true, + offsetY: true, + pointerId: true, + pointerType: true, + screenX: true, + screenY: true, + targetTouches: true, + toElement: true, + touches: true, + + which: function( event ) { + var button = event.button; + + // Add which for key events + if ( event.which == null && rkeyEvent.test( event.type ) ) { + return event.charCode != null ? event.charCode : event.keyCode; + } + + // Add which for click: 1 === left; 2 === middle; 3 === right + if ( !event.which && button !== undefined && rmouseEvent.test( event.type ) ) { + if ( button & 1 ) { + return 1; + } + + if ( button & 2 ) { + return 3; + } + + if ( button & 4 ) { + return 2; + } + + return 0; + } + + return event.which; + } +}, jQuery.event.addProp ); + +jQuery.each( { focus: "focusin", blur: "focusout" }, function( type, delegateType ) { + jQuery.event.special[ type ] = { + + // Utilize native event if possible so blur/focus sequence is correct + setup: function() { + + // Claim the first handler + // dataPriv.set( this, "focus", ... ) + // dataPriv.set( this, "blur", ... ) + leverageNative( this, type, expectSync ); + + // Return false to allow normal processing in the caller + return false; + }, + trigger: function() { + + // Force setup before trigger + leverageNative( this, type ); + + // Return non-false to allow normal event-path propagation + return true; + }, + + delegateType: delegateType + }; +} ); + +// Create mouseenter/leave events using mouseover/out and event-time checks +// so that event delegation works in jQuery. 
+// Do the same for pointerenter/pointerleave and pointerover/pointerout +// +// Support: Safari 7 only +// Safari sends mouseenter too often; see: +// https://bugs.chromium.org/p/chromium/issues/detail?id=470258 +// for the description of the bug (it existed in older Chrome versions as well). +jQuery.each( { + mouseenter: "mouseover", + mouseleave: "mouseout", + pointerenter: "pointerover", + pointerleave: "pointerout" +}, function( orig, fix ) { + jQuery.event.special[ orig ] = { + delegateType: fix, + bindType: fix, + + handle: function( event ) { + var ret, + target = this, + related = event.relatedTarget, + handleObj = event.handleObj; + + // For mouseenter/leave call the handler if related is outside the target. + // NB: No relatedTarget if the mouse left/entered the browser window + if ( !related || ( related !== target && !jQuery.contains( target, related ) ) ) { + event.type = handleObj.origType; + ret = handleObj.handler.apply( this, arguments ); + event.type = fix; + } + return ret; + } + }; +} );
+ +jQuery.fn.extend( { + + on: function( types, selector, data, fn ) { + return on( this, types, selector, data, fn ); + }, + one: function( types, selector, data, fn ) { + return on( this, types, selector, data, fn, 1 ); + }, + off: function( types, selector, fn ) { + var handleObj, type; + if ( types && types.preventDefault && types.handleObj ) { + + // ( event ) dispatched jQuery.Event + handleObj = types.handleObj; + jQuery( types.delegateTarget ).off( + handleObj.namespace ? + handleObj.origType + "." + handleObj.namespace : + handleObj.origType, + handleObj.selector, + handleObj.handler + ); + return this; + } + if ( typeof types === "object" ) { + + // ( types-object [, selector] ) + for ( type in types ) { + this.off( type, selector, types[ type ] ); + } + return this; + } + if ( selector === false || typeof selector === "function" ) { + + // ( types [, fn] ) + fn = selector; + selector = undefined; + } + if ( fn === false ) { + fn = returnFalse; + } + return this.each( function() { + jQuery.event.remove( this, types, fn, selector ); + } ); + } +} );
+ + +var + + // Support: IE <=10 - 11, Edge 12 - 13 only + // In IE/Edge using regex groups here causes severe slowdowns. + // See https://connect.microsoft.com/IE/feedback/details/1736512/ + rnoInnerhtml = /<script|<style|<link/i, + + // checked="checked" or checked + rchecked = /checked\s*(?:[^=]|=\s*.checked.)/i, + + rcleanScript = /^\s*<!(?:\[CDATA\[|--)|(?:\]\]|--)>\s*$/g; + +// Prefer a tbody over its parent table for containing new rows +function manipulationTarget( elem, content ) { + if ( nodeName( elem, "table" ) && + nodeName( content.nodeType !== 11 ? content : content.firstChild, "tr" ) ) { + + return jQuery( elem ).children( "tbody" )[ 0 ] || elem; + } + + return elem; +}
+ +// Replace/restore the type attribute of script elements for safe DOM manipulation +function disableScript( elem ) { + elem.type = ( elem.getAttribute( "type" ) !== null ) + "/" + elem.type; + return elem; +} +function restoreScript( elem ) { + if ( ( elem.type || "" ).slice( 0, 5 ) === "true/" ) { + elem.type = elem.type.slice( 5 ); + } else { + elem.removeAttribute( "type" ); + } + + return elem; +} + +function cloneCopyEvent( src, dest ) { + var i, l, type, pdataOld, udataOld, udataCur, events; + + if ( dest.nodeType !== 1 ) { + return; + } + + // 1. Copy private data: events, handlers, etc. 
+ if ( dataPriv.hasData( src ) ) { + pdataOld = dataPriv.get( src ); + events = pdataOld.events; + + if ( events ) { + dataPriv.remove( dest, "handle events" ); + + for ( type in events ) { + for ( i = 0, l = events[ type ].length; i < l; i++ ) { + jQuery.event.add( dest, type, events[ type ][ i ] ); + } + } + } + } + + // 2. Copy user data + if ( dataUser.hasData( src ) ) { + udataOld = dataUser.access( src ); + udataCur = jQuery.extend( {}, udataOld ); + + dataUser.set( dest, udataCur ); + } +} + +// Fix IE bugs, see support tests +function fixInput( src, dest ) { + var nodeName = dest.nodeName.toLowerCase(); + + // Fails to persist the checked state of a cloned checkbox or radio button. + if ( nodeName === "input" && rcheckableType.test( src.type ) ) { + dest.checked = src.checked; + + // Fails to return the selected option to the default selected state when cloning options + } else if ( nodeName === "input" || nodeName === "textarea" ) { + dest.defaultValue = src.defaultValue; + } +} + +function domManip( collection, args, callback, ignored ) { + + // Flatten any nested arrays + args = flat( args ); + + var fragment, first, scripts, hasScripts, node, doc, + i = 0, + l = collection.length, + iNoClone = l - 1, + value = args[ 0 ], + valueIsFunction = isFunction( value ); + + // We can't cloneNode fragments that contain checked, in WebKit + if ( valueIsFunction || + ( l > 1 && typeof value === "string" && + !support.checkClone && rchecked.test( value ) ) ) { + return collection.each( function( index ) { + var self = collection.eq( index ); + if ( valueIsFunction ) { + args[ 0 ] = value.call( this, index, self.html() ); + } + domManip( self, args, callback, ignored ); + } ); + } + + if ( l ) { + fragment = buildFragment( args, collection[ 0 ].ownerDocument, false, collection, ignored ); + first = fragment.firstChild; + + if ( fragment.childNodes.length === 1 ) { + fragment = first; + } + + // Require either new content or an interest in ignored elements to invoke the callback + if ( first || ignored ) { + scripts = jQuery.map( getAll( fragment, "script" ), disableScript ); + hasScripts = scripts.length; + + // Use the original fragment for the last item + // instead of the first because it can end up + // being emptied incorrectly in certain situations (#8070). 
+ for ( ; i < l; i++ ) { + node = fragment; + + if ( i !== iNoClone ) { + node = jQuery.clone( node, true, true ); + + // Keep references to cloned scripts for later restoration + if ( hasScripts ) { + + // Support: Android <=4.0 only, PhantomJS 1 only + // push.apply(_, arraylike) throws on ancient WebKit + jQuery.merge( scripts, getAll( node, "script" ) ); + } + } + + callback.call( collection[ i ], node, i ); + } + + if ( hasScripts ) { + doc = scripts[ scripts.length - 1 ].ownerDocument; + + // Reenable scripts + jQuery.map( scripts, restoreScript ); + + // Evaluate executable scripts on first document insertion + for ( i = 0; i < hasScripts; i++ ) { + node = scripts[ i ]; + if ( rscriptType.test( node.type || "" ) && + !dataPriv.access( node, "globalEval" ) && + jQuery.contains( doc, node ) ) { + + if ( node.src && ( node.type || "" ).toLowerCase() !== "module" ) { + + // Optional AJAX dependency, but won't run scripts if not present + if ( jQuery._evalUrl && !node.noModule ) { + jQuery._evalUrl( node.src, { + nonce: node.nonce || node.getAttribute( "nonce" ) + }, doc ); + } + } else { + DOMEval( node.textContent.replace( rcleanScript, "" ), node, doc ); + } + } + } + } + } + } + + return collection; +} + +function remove( elem, selector, keepData ) { + var node, + nodes = selector ? jQuery.filter( selector, elem ) : elem, + i = 0; + + for ( ; ( node = nodes[ i ] ) != null; i++ ) { + if ( !keepData && node.nodeType === 1 ) { + jQuery.cleanData( getAll( node ) ); + } + + if ( node.parentNode ) { + if ( keepData && isAttached( node ) ) { + setGlobalEval( getAll( node, "script" ) ); + } + node.parentNode.removeChild( node ); + } + } + + return elem; +} + +jQuery.extend( { + htmlPrefilter: function( html ) { + return html; + }, + + clone: function( elem, dataAndEvents, deepDataAndEvents ) { + var i, l, srcElements, destElements, + clone = elem.cloneNode( true ), + inPage = isAttached( elem ); + + // Fix IE cloning issues + if ( !support.noCloneChecked && ( elem.nodeType === 1 || elem.nodeType === 11 ) && + !jQuery.isXMLDoc( elem ) ) { + + // We eschew Sizzle here for performance reasons: https://jsperf.com/getall-vs-sizzle/2 + destElements = getAll( clone ); + srcElements = getAll( elem ); + + for ( i = 0, l = srcElements.length; i < l; i++ ) { + fixInput( srcElements[ i ], destElements[ i ] ); + } + } + + // Copy the events from the original to the clone + if ( dataAndEvents ) { + if ( deepDataAndEvents ) { + srcElements = srcElements || getAll( elem ); + destElements = destElements || getAll( clone ); + + for ( i = 0, l = srcElements.length; i < l; i++ ) { + cloneCopyEvent( srcElements[ i ], destElements[ i ] ); + } + } else { + cloneCopyEvent( elem, clone ); + } + } + + // Preserve script evaluation history + destElements = getAll( clone, "script" ); + if ( destElements.length > 0 ) { + setGlobalEval( destElements, !inPage && getAll( elem, "script" ) ); + } + + // Return the cloned set + return clone; + }, + + cleanData: function( elems ) { + var data, elem, type, + special = jQuery.event.special, + i = 0; + + for ( ; ( elem = elems[ i ] ) !== undefined; i++ ) { + if ( acceptData( elem ) ) { + if ( ( data = elem[ dataPriv.expando ] ) ) { + if ( data.events ) { + for ( type in data.events ) { + if ( special[ type ] ) { + jQuery.event.remove( elem, type ); + + // This is a shortcut to avoid jQuery.event.remove's overhead + } else { + jQuery.removeEvent( elem, type, data.handle ); + } + } + } + + // Support: Chrome <=35 - 45+ + // Assign undefined instead of using delete, see Data#remove 
+ elem[ dataPriv.expando ] = undefined; + } + if ( elem[ dataUser.expando ] ) { + + // Support: Chrome <=35 - 45+ + // Assign undefined instead of using delete, see Data#remove + elem[ dataUser.expando ] = undefined; + } + } + } + } +} ); + +jQuery.fn.extend( { + detach: function( selector ) { + return remove( this, selector, true ); + }, + + remove: function( selector ) { + return remove( this, selector ); + }, + + text: function( value ) { + return access( this, function( value ) { + return value === undefined ? + jQuery.text( this ) : + this.empty().each( function() { + if ( this.nodeType === 1 || this.nodeType === 11 || this.nodeType === 9 ) { + this.textContent = value; + } + } ); + }, null, value, arguments.length ); + }, + + append: function() { + return domManip( this, arguments, function( elem ) { + if ( this.nodeType === 1 || this.nodeType === 11 || this.nodeType === 9 ) { + var target = manipulationTarget( this, elem ); + target.appendChild( elem ); + } + } ); + }, + + prepend: function() { + return domManip( this, arguments, function( elem ) { + if ( this.nodeType === 1 || this.nodeType === 11 || this.nodeType === 9 ) { + var target = manipulationTarget( this, elem ); + target.insertBefore( elem, target.firstChild ); + } + } ); + }, + + before: function() { + return domManip( this, arguments, function( elem ) { + if ( this.parentNode ) { + this.parentNode.insertBefore( elem, this ); + } + } ); + }, + + after: function() { + return domManip( this, arguments, function( elem ) { + if ( this.parentNode ) { + this.parentNode.insertBefore( elem, this.nextSibling ); + } + } ); + }, + + empty: function() { + var elem, + i = 0; + + for ( ; ( elem = this[ i ] ) != null; i++ ) { + if ( elem.nodeType === 1 ) { + + // Prevent memory leaks + jQuery.cleanData( getAll( elem, false ) ); + + // Remove any remaining nodes + elem.textContent = ""; + } + } + + return this; + }, + + clone: function( dataAndEvents, deepDataAndEvents ) { + dataAndEvents = dataAndEvents == null ? false : dataAndEvents; + deepDataAndEvents = deepDataAndEvents == null ? 
dataAndEvents : deepDataAndEvents; + + return this.map( function() { + return jQuery.clone( this, dataAndEvents, deepDataAndEvents ); + } ); + }, + + html: function( value ) { + return access( this, function( value ) { + var elem = this[ 0 ] || {}, + i = 0, + l = this.length; + + if ( value === undefined && elem.nodeType === 1 ) { + return elem.innerHTML; + } + + // See if we can take a shortcut and just use innerHTML + if ( typeof value === "string" && !rnoInnerhtml.test( value ) && + !wrapMap[ ( rtagName.exec( value ) || [ "", "" ] )[ 1 ].toLowerCase() ] ) { + + value = jQuery.htmlPrefilter( value ); + + try { + for ( ; i < l; i++ ) { + elem = this[ i ] || {}; + + // Remove element nodes and prevent memory leaks + if ( elem.nodeType === 1 ) { + jQuery.cleanData( getAll( elem, false ) ); + elem.innerHTML = value; + } + } + + elem = 0; + + // If using innerHTML throws an exception, use the fallback method + } catch ( e ) {} + } + + if ( elem ) { + this.empty().append( value ); + } + }, null, value, arguments.length ); + }, + + replaceWith: function() { + var ignored = []; + + // Make the changes, replacing each non-ignored context element with the new content + return domManip( this, arguments, function( elem ) { + var parent = this.parentNode; + + if ( jQuery.inArray( this, ignored ) < 0 ) { + jQuery.cleanData( getAll( this ) ); + if ( parent ) { + parent.replaceChild( elem, this ); + } + } + + // Force callback invocation + }, ignored ); + } +} ); + +jQuery.each( { + appendTo: "append", + prependTo: "prepend", + insertBefore: "before", + insertAfter: "after", + replaceAll: "replaceWith" +}, function( name, original ) { + jQuery.fn[ name ] = function( selector ) { + var elems, + ret = [], + insert = jQuery( selector ), + last = insert.length - 1, + i = 0; + + for ( ; i <= last; i++ ) { + elems = i === last ? this : this.clone( true ); + jQuery( insert[ i ] )[ original ]( elems ); + + // Support: Android <=4.0 only, PhantomJS 1 only + // .get() because push.apply(_, arraylike) throws on ancient WebKit + push.apply( ret, elems.get() ); + } + + return this.pushStack( ret ); + }; +} ); +var rnumnonpx = new RegExp( "^(" + pnum + ")(?!px)[a-z%]+$", "i" ); + +var getStyles = function( elem ) { + + // Support: IE <=11 only, Firefox <=30 (#15098, #14150) + // IE throws on elements created in popups + // FF meanwhile throws on frame elements through "defaultView.getComputedStyle" + var view = elem.ownerDocument.defaultView; + + if ( !view || !view.opener ) { + view = window; + } + + return view.getComputedStyle( elem ); + }; + +var swap = function( elem, options, callback ) { + var ret, name, + old = {}; + + // Remember the old values, and insert the new ones + for ( name in options ) { + old[ name ] = elem.style[ name ]; + elem.style[ name ] = options[ name ]; + } + + ret = callback.call( elem ); + + // Revert the old values + for ( name in options ) { + elem.style[ name ] = old[ name ]; + } + + return ret; +}; + + +var rboxStyle = new RegExp( cssExpand.join( "|" ), "i" ); + + + +( function() { + + // Executing both pixelPosition & boxSizingReliable tests require only one layout + // so they're executed at the same time to save the second computation. 
+ function computeStyleTests() { + + // This is a singleton, we need to execute it only once + if ( !div ) { + return; + } + + container.style.cssText = "position:absolute;left:-11111px;width:60px;" + + "margin-top:1px;padding:0;border:0"; + div.style.cssText = + "position:relative;display:block;box-sizing:border-box;overflow:scroll;" + + "margin:auto;border:1px;padding:1px;" + + "width:60%;top:1%"; + documentElement.appendChild( container ).appendChild( div ); + + var divStyle = window.getComputedStyle( div ); + pixelPositionVal = divStyle.top !== "1%"; + + // Support: Android 4.0 - 4.3 only, Firefox <=3 - 44 + reliableMarginLeftVal = roundPixelMeasures( divStyle.marginLeft ) === 12; + + // Support: Android 4.0 - 4.3 only, Safari <=9.1 - 10.1, iOS <=7.0 - 9.3 + // Some styles come back with percentage values, even though they shouldn't + div.style.right = "60%"; + pixelBoxStylesVal = roundPixelMeasures( divStyle.right ) === 36; + + // Support: IE 9 - 11 only + // Detect misreporting of content dimensions for box-sizing:border-box elements + boxSizingReliableVal = roundPixelMeasures( divStyle.width ) === 36; + + // Support: IE 9 only + // Detect overflow:scroll screwiness (gh-3699) + // Support: Chrome <=64 + // Don't get tricked when zoom affects offsetWidth (gh-4029) + div.style.position = "absolute"; + scrollboxSizeVal = roundPixelMeasures( div.offsetWidth / 3 ) === 12; + + documentElement.removeChild( container ); + + // Nullify the div so it wouldn't be stored in the memory and + // it will also be a sign that checks already performed + div = null; + } + + function roundPixelMeasures( measure ) { + return Math.round( parseFloat( measure ) ); + } + + var pixelPositionVal, boxSizingReliableVal, scrollboxSizeVal, pixelBoxStylesVal, + reliableTrDimensionsVal, reliableMarginLeftVal, + container = document.createElement( "div" ), + div = document.createElement( "div" ); + + // Finish early in limited (non-browser) environments + if ( !div.style ) { + return; + } + + // Support: IE <=9 - 11 only + // Style of cloned element affects source element cloned (#8908) + div.style.backgroundClip = "content-box"; + div.cloneNode( true ).style.backgroundClip = ""; + support.clearCloneStyle = div.style.backgroundClip === "content-box"; + + jQuery.extend( support, { + boxSizingReliable: function() { + computeStyleTests(); + return boxSizingReliableVal; + }, + pixelBoxStyles: function() { + computeStyleTests(); + return pixelBoxStylesVal; + }, + pixelPosition: function() { + computeStyleTests(); + return pixelPositionVal; + }, + reliableMarginLeft: function() { + computeStyleTests(); + return reliableMarginLeftVal; + }, + scrollboxSize: function() { + computeStyleTests(); + return scrollboxSizeVal; + }, + + // Support: IE 9 - 11+, Edge 15 - 18+ + // IE/Edge misreport `getComputedStyle` of table rows with width/height + // set in CSS while `offset*` properties report correct values. + // Behavior in IE 9 is more subtle than in newer versions & it passes + // some versions of this test; make sure not to make it pass there! 
+ reliableTrDimensions: function() { + var table, tr, trChild, trStyle; + if ( reliableTrDimensionsVal == null ) { + table = document.createElement( "table" ); + tr = document.createElement( "tr" ); + trChild = document.createElement( "div" ); + + table.style.cssText = "position:absolute;left:-11111px"; + tr.style.height = "1px"; + trChild.style.height = "9px"; + + documentElement + .appendChild( table ) + .appendChild( tr ) + .appendChild( trChild ); + + trStyle = window.getComputedStyle( tr ); + reliableTrDimensionsVal = parseInt( trStyle.height ) > 3; + + documentElement.removeChild( table ); + } + return reliableTrDimensionsVal; + } + } ); +} )(); + + +function curCSS( elem, name, computed ) { + var width, minWidth, maxWidth, ret, + + // Support: Firefox 51+ + // Retrieving style before computed somehow + // fixes an issue with getting wrong values + // on detached elements + style = elem.style; + + computed = computed || getStyles( elem ); + + // getPropertyValue is needed for: + // .css('filter') (IE 9 only, #12537) + // .css('--customProperty) (#3144) + if ( computed ) { + ret = computed.getPropertyValue( name ) || computed[ name ]; + + if ( ret === "" && !isAttached( elem ) ) { + ret = jQuery.style( elem, name ); + } + + // A tribute to the "awesome hack by Dean Edwards" + // Android Browser returns percentage for some values, + // but width seems to be reliably pixels. + // This is against the CSSOM draft spec: + // https://drafts.csswg.org/cssom/#resolved-values + if ( !support.pixelBoxStyles() && rnumnonpx.test( ret ) && rboxStyle.test( name ) ) { + + // Remember the original values + width = style.width; + minWidth = style.minWidth; + maxWidth = style.maxWidth; + + // Put in the new values to get a computed value out + style.minWidth = style.maxWidth = style.width = ret; + ret = computed.width; + + // Revert the changed values + style.width = width; + style.minWidth = minWidth; + style.maxWidth = maxWidth; + } + } + + return ret !== undefined ? + + // Support: IE <=9 - 11 only + // IE returns zIndex value as an integer. + ret + "" : + ret; +} + + +function addGetHookIf( conditionFn, hookFn ) { + + // Define the hook, we'll check on the first run if it's really needed. + return { + get: function() { + if ( conditionFn() ) { + + // Hook not needed (or it's not possible to use it due + // to missing dependency), remove it. + delete this.get; + return; + } + + // Hook needed; redefine it so that the support test is not executed again. 
+ return ( this.get = hookFn ).apply( this, arguments ); + } + }; +} + + +var cssPrefixes = [ "Webkit", "Moz", "ms" ], + emptyStyle = document.createElement( "div" ).style, + vendorProps = {}; + +// Return a vendor-prefixed property or undefined +function vendorPropName( name ) { + + // Check for vendor prefixed names + var capName = name[ 0 ].toUpperCase() + name.slice( 1 ), + i = cssPrefixes.length; + + while ( i-- ) { + name = cssPrefixes[ i ] + capName; + if ( name in emptyStyle ) { + return name; + } + } +} + +// Return a potentially-mapped jQuery.cssProps or vendor prefixed property +function finalPropName( name ) { + var final = jQuery.cssProps[ name ] || vendorProps[ name ]; + + if ( final ) { + return final; + } + if ( name in emptyStyle ) { + return name; + } + return vendorProps[ name ] = vendorPropName( name ) || name; +} + + +var + + // Swappable if display is none or starts with table + // except "table", "table-cell", or "table-caption" + // See here for display values: https://developer.mozilla.org/en-US/docs/CSS/display + rdisplayswap = /^(none|table(?!-c[ea]).+)/, + rcustomProp = /^--/, + cssShow = { position: "absolute", visibility: "hidden", display: "block" }, + cssNormalTransform = { + letterSpacing: "0", + fontWeight: "400" + }; + +function setPositiveNumber( _elem, value, subtract ) { + + // Any relative (+/-) values have already been + // normalized at this point + var matches = rcssNum.exec( value ); + return matches ? + + // Guard against undefined "subtract", e.g., when used as in cssHooks + Math.max( 0, matches[ 2 ] - ( subtract || 0 ) ) + ( matches[ 3 ] || "px" ) : + value; +} + +function boxModelAdjustment( elem, dimension, box, isBorderBox, styles, computedVal ) { + var i = dimension === "width" ? 1 : 0, + extra = 0, + delta = 0; + + // Adjustment may not be necessary + if ( box === ( isBorderBox ? 
"border" : "content" ) ) { + return 0; + } + + for ( ; i < 4; i += 2 ) { + + // Both box models exclude margin + if ( box === "margin" ) { + delta += jQuery.css( elem, box + cssExpand[ i ], true, styles ); + } + + // If we get here with a content-box, we're seeking "padding" or "border" or "margin" + if ( !isBorderBox ) { + + // Add padding + delta += jQuery.css( elem, "padding" + cssExpand[ i ], true, styles ); + + // For "border" or "margin", add border + if ( box !== "padding" ) { + delta += jQuery.css( elem, "border" + cssExpand[ i ] + "Width", true, styles ); + + // But still keep track of it otherwise + } else { + extra += jQuery.css( elem, "border" + cssExpand[ i ] + "Width", true, styles ); + } + + // If we get here with a border-box (content + padding + border), we're seeking "content" or + // "padding" or "margin" + } else { + + // For "content", subtract padding + if ( box === "content" ) { + delta -= jQuery.css( elem, "padding" + cssExpand[ i ], true, styles ); + } + + // For "content" or "padding", subtract border + if ( box !== "margin" ) { + delta -= jQuery.css( elem, "border" + cssExpand[ i ] + "Width", true, styles ); + } + } + } + + // Account for positive content-box scroll gutter when requested by providing computedVal + if ( !isBorderBox && computedVal >= 0 ) { + + // offsetWidth/offsetHeight is a rounded sum of content, padding, scroll gutter, and border + // Assuming integer scroll gutter, subtract the rest and round down + delta += Math.max( 0, Math.ceil( + elem[ "offset" + dimension[ 0 ].toUpperCase() + dimension.slice( 1 ) ] - + computedVal - + delta - + extra - + 0.5 + + // If offsetWidth/offsetHeight is unknown, then we can't determine content-box scroll gutter + // Use an explicit zero to avoid NaN (gh-3964) + ) ) || 0; + } + + return delta; +} + +function getWidthOrHeight( elem, dimension, extra ) { + + // Start with computed style + var styles = getStyles( elem ), + + // To avoid forcing a reflow, only fetch boxSizing if we need it (gh-4322). + // Fake content-box until we know it's needed to know the true value. + boxSizingNeeded = !support.boxSizingReliable() || extra, + isBorderBox = boxSizingNeeded && + jQuery.css( elem, "boxSizing", false, styles ) === "border-box", + valueIsBorderBox = isBorderBox, + + val = curCSS( elem, dimension, styles ), + offsetProp = "offset" + dimension[ 0 ].toUpperCase() + dimension.slice( 1 ); + + // Support: Firefox <=54 + // Return a confounding non-pixel value or feign ignorance, as appropriate. + if ( rnumnonpx.test( val ) ) { + if ( !extra ) { + return val; + } + val = "auto"; + } + + + // Support: IE 9 - 11 only + // Use offsetWidth/offsetHeight for when box sizing is unreliable. + // In those cases, the computed value can be trusted to be border-box. + if ( ( !support.boxSizingReliable() && isBorderBox || + + // Support: IE 10 - 11+, Edge 15 - 18+ + // IE/Edge misreport `getComputedStyle` of table rows with width/height + // set in CSS while `offset*` properties report correct values. + // Interestingly, in some cases IE 9 doesn't suffer from this issue. 
+ !support.reliableTrDimensions() && nodeName( elem, "tr" ) || + + // Fall back to offsetWidth/offsetHeight when value is "auto" + // This happens for inline elements with no explicit setting (gh-3571) + val === "auto" || + + // Support: Android <=4.1 - 4.3 only + // Also use offsetWidth/offsetHeight for misreported inline dimensions (gh-3602) + !parseFloat( val ) && jQuery.css( elem, "display", false, styles ) === "inline" ) && + + // Make sure the element is visible & connected + elem.getClientRects().length ) { + + isBorderBox = jQuery.css( elem, "boxSizing", false, styles ) === "border-box"; + + // Where available, offsetWidth/offsetHeight approximate border box dimensions. + // Where not available (e.g., SVG), assume unreliable box-sizing and interpret the + // retrieved value as a content box dimension. + valueIsBorderBox = offsetProp in elem; + if ( valueIsBorderBox ) { + val = elem[ offsetProp ]; + } + } + + // Normalize "" and auto + val = parseFloat( val ) || 0; + + // Adjust for the element's box model + return ( val + + boxModelAdjustment( + elem, + dimension, + extra || ( isBorderBox ? "border" : "content" ), + valueIsBorderBox, + styles, + + // Provide the current computed size to request scroll gutter calculation (gh-3589) + val + ) + ) + "px"; +} + +jQuery.extend( { + + // Add in style property hooks for overriding the default + // behavior of getting and setting a style property + cssHooks: { + opacity: { + get: function( elem, computed ) { + if ( computed ) { + + // We should always get a number back from opacity + var ret = curCSS( elem, "opacity" ); + return ret === "" ? "1" : ret; + } + } + } + }, + + // Don't automatically add "px" to these possibly-unitless properties + cssNumber: { + "animationIterationCount": true, + "columnCount": true, + "fillOpacity": true, + "flexGrow": true, + "flexShrink": true, + "fontWeight": true, + "gridArea": true, + "gridColumn": true, + "gridColumnEnd": true, + "gridColumnStart": true, + "gridRow": true, + "gridRowEnd": true, + "gridRowStart": true, + "lineHeight": true, + "opacity": true, + "order": true, + "orphans": true, + "widows": true, + "zIndex": true, + "zoom": true + }, + + // Add in properties whose names you wish to fix before + // setting or getting the value + cssProps: {}, + + // Get and set the style property on a DOM Node + style: function( elem, name, value, extra ) { + + // Don't set styles on text and comment nodes + if ( !elem || elem.nodeType === 3 || elem.nodeType === 8 || !elem.style ) { + return; + } + + // Make sure that we're working with the right name + var ret, type, hooks, + origName = camelCase( name ), + isCustomProp = rcustomProp.test( name ), + style = elem.style; + + // Make sure that we're working with the right name. We don't + // want to query the value if it is a CSS custom property + // since they are user-defined. 
+ if ( !isCustomProp ) { + name = finalPropName( origName ); + } + + // Gets hook for the prefixed version, then unprefixed version + hooks = jQuery.cssHooks[ name ] || jQuery.cssHooks[ origName ]; + + // Check if we're setting a value + if ( value !== undefined ) { + type = typeof value; + + // Convert "+=" or "-=" to relative numbers (#7345) + if ( type === "string" && ( ret = rcssNum.exec( value ) ) && ret[ 1 ] ) { + value = adjustCSS( elem, name, ret ); + + // Fixes bug #9237 + type = "number"; + } + + // Make sure that null and NaN values aren't set (#7116) + if ( value == null || value !== value ) { + return; + } + + // If a number was passed in, add the unit (except for certain CSS properties) + // The isCustomProp check can be removed in jQuery 4.0 when we only auto-append + // "px" to a few hardcoded values. + if ( type === "number" && !isCustomProp ) { + value += ret && ret[ 3 ] || ( jQuery.cssNumber[ origName ] ? "" : "px" ); + } + + // background-* props affect original clone's values + if ( !support.clearCloneStyle && value === "" && name.indexOf( "background" ) === 0 ) { + style[ name ] = "inherit"; + } + + // If a hook was provided, use that value, otherwise just set the specified value + if ( !hooks || !( "set" in hooks ) || + ( value = hooks.set( elem, value, extra ) ) !== undefined ) { + + if ( isCustomProp ) { + style.setProperty( name, value ); + } else { + style[ name ] = value; + } + } + + } else { + + // If a hook was provided get the non-computed value from there + if ( hooks && "get" in hooks && + ( ret = hooks.get( elem, false, extra ) ) !== undefined ) { + + return ret; + } + + // Otherwise just get the value from the style object + return style[ name ]; + } + }, + + css: function( elem, name, extra, styles ) { + var val, num, hooks, + origName = camelCase( name ), + isCustomProp = rcustomProp.test( name ); + + // Make sure that we're working with the right name. We don't + // want to modify the value if it is a CSS custom property + // since they are user-defined. + if ( !isCustomProp ) { + name = finalPropName( origName ); + } + + // Try prefixed name followed by the unprefixed name + hooks = jQuery.cssHooks[ name ] || jQuery.cssHooks[ origName ]; + + // If a hook was provided get the computed value from there + if ( hooks && "get" in hooks ) { + val = hooks.get( elem, true, extra ); + } + + // Otherwise, if a way to get the computed value exists, use that + if ( val === undefined ) { + val = curCSS( elem, name, styles ); + } + + // Convert "normal" to computed value + if ( val === "normal" && name in cssNormalTransform ) { + val = cssNormalTransform[ name ]; + } + + // Make numeric if forced or a qualifier was provided and val looks numeric + if ( extra === "" || extra ) { + num = parseFloat( val ); + return extra === true || isFinite( num ) ? num || 0 : val; + } + + return val; + } +} ); + +jQuery.each( [ "height", "width" ], function( _i, dimension ) { + jQuery.cssHooks[ dimension ] = { + get: function( elem, computed, extra ) { + if ( computed ) { + + // Certain elements can have dimension info if we invisibly show them + // but it must have a current display style that would benefit + return rdisplayswap.test( jQuery.css( elem, "display" ) ) && + + // Support: Safari 8+ + // Table columns in Safari have non-zero offsetWidth & zero + // getBoundingClientRect().width unless display is changed. + // Support: IE <=11 only + // Running getBoundingClientRect on a disconnected node + // in IE throws an error. 
+ ( !elem.getClientRects().length || !elem.getBoundingClientRect().width ) ? + swap( elem, cssShow, function() { + return getWidthOrHeight( elem, dimension, extra ); + } ) : + getWidthOrHeight( elem, dimension, extra ); + } + }, + + set: function( elem, value, extra ) { + var matches, + styles = getStyles( elem ), + + // Only read styles.position if the test has a chance to fail + // to avoid forcing a reflow. + scrollboxSizeBuggy = !support.scrollboxSize() && + styles.position === "absolute", + + // To avoid forcing a reflow, only fetch boxSizing if we need it (gh-3991) + boxSizingNeeded = scrollboxSizeBuggy || extra, + isBorderBox = boxSizingNeeded && + jQuery.css( elem, "boxSizing", false, styles ) === "border-box", + subtract = extra ? + boxModelAdjustment( + elem, + dimension, + extra, + isBorderBox, + styles + ) : + 0; + + // Account for unreliable border-box dimensions by comparing offset* to computed and + // faking a content-box to get border and padding (gh-3699) + if ( isBorderBox && scrollboxSizeBuggy ) { + subtract -= Math.ceil( + elem[ "offset" + dimension[ 0 ].toUpperCase() + dimension.slice( 1 ) ] - + parseFloat( styles[ dimension ] ) - + boxModelAdjustment( elem, dimension, "border", false, styles ) - + 0.5 + ); + } + + // Convert to pixels if value adjustment is needed + if ( subtract && ( matches = rcssNum.exec( value ) ) && + ( matches[ 3 ] || "px" ) !== "px" ) { + + elem.style[ dimension ] = value; + value = jQuery.css( elem, dimension ); + } + + return setPositiveNumber( elem, value, subtract ); + } + }; +} ); + +jQuery.cssHooks.marginLeft = addGetHookIf( support.reliableMarginLeft, + function( elem, computed ) { + if ( computed ) { + return ( parseFloat( curCSS( elem, "marginLeft" ) ) || + elem.getBoundingClientRect().left - + swap( elem, { marginLeft: 0 }, function() { + return elem.getBoundingClientRect().left; + } ) + ) + "px"; + } + } +); + +// These hooks are used by animate to expand properties +jQuery.each( { + margin: "", + padding: "", + border: "Width" +}, function( prefix, suffix ) { + jQuery.cssHooks[ prefix + suffix ] = { + expand: function( value ) { + var i = 0, + expanded = {}, + + // Assumes a single number if not a string + parts = typeof value === "string" ? value.split( " " ) : [ value ]; + + for ( ; i < 4; i++ ) { + expanded[ prefix + cssExpand[ i ] + suffix ] = + parts[ i ] || parts[ i - 2 ] || parts[ 0 ]; + } + + return expanded; + } + }; + + if ( prefix !== "margin" ) { + jQuery.cssHooks[ prefix + suffix ].set = setPositiveNumber; + } +} ); + +jQuery.fn.extend( { + css: function( name, value ) { + return access( this, function( elem, name, value ) { + var styles, len, + map = {}, + i = 0; + + if ( Array.isArray( name ) ) { + styles = getStyles( elem ); + len = name.length; + + for ( ; i < len; i++ ) { + map[ name[ i ] ] = jQuery.css( elem, name[ i ], false, styles ); + } + + return map; + } + + return value !== undefined ? + jQuery.style( elem, name, value ) : + jQuery.css( elem, name ); + }, name, value, arguments.length > 1 ); + } +} ); + + +function Tween( elem, options, prop, end, easing ) { + return new Tween.prototype.init( elem, options, prop, end, easing ); +} +jQuery.Tween = Tween; + +Tween.prototype = { + constructor: Tween, + init: function( elem, options, prop, end, easing, unit ) { + this.elem = elem; + this.prop = prop; + this.easing = easing || jQuery.easing._default; + this.options = options; + this.start = this.now = this.cur(); + this.end = end; + this.unit = unit || ( jQuery.cssNumber[ prop ] ? 
"" : "px" ); + }, + cur: function() { + var hooks = Tween.propHooks[ this.prop ]; + + return hooks && hooks.get ? + hooks.get( this ) : + Tween.propHooks._default.get( this ); + }, + run: function( percent ) { + var eased, + hooks = Tween.propHooks[ this.prop ]; + + if ( this.options.duration ) { + this.pos = eased = jQuery.easing[ this.easing ]( + percent, this.options.duration * percent, 0, 1, this.options.duration + ); + } else { + this.pos = eased = percent; + } + this.now = ( this.end - this.start ) * eased + this.start; + + if ( this.options.step ) { + this.options.step.call( this.elem, this.now, this ); + } + + if ( hooks && hooks.set ) { + hooks.set( this ); + } else { + Tween.propHooks._default.set( this ); + } + return this; + } +}; + +Tween.prototype.init.prototype = Tween.prototype; + +Tween.propHooks = { + _default: { + get: function( tween ) { + var result; + + // Use a property on the element directly when it is not a DOM element, + // or when there is no matching style property that exists. + if ( tween.elem.nodeType !== 1 || + tween.elem[ tween.prop ] != null && tween.elem.style[ tween.prop ] == null ) { + return tween.elem[ tween.prop ]; + } + + // Passing an empty string as a 3rd parameter to .css will automatically + // attempt a parseFloat and fallback to a string if the parse fails. + // Simple values such as "10px" are parsed to Float; + // complex values such as "rotate(1rad)" are returned as-is. + result = jQuery.css( tween.elem, tween.prop, "" ); + + // Empty strings, null, undefined and "auto" are converted to 0. + return !result || result === "auto" ? 0 : result; + }, + set: function( tween ) { + + // Use step hook for back compat. + // Use cssHook if its there. + // Use .style if available and use plain properties where available. + if ( jQuery.fx.step[ tween.prop ] ) { + jQuery.fx.step[ tween.prop ]( tween ); + } else if ( tween.elem.nodeType === 1 && ( + jQuery.cssHooks[ tween.prop ] || + tween.elem.style[ finalPropName( tween.prop ) ] != null ) ) { + jQuery.style( tween.elem, tween.prop, tween.now + tween.unit ); + } else { + tween.elem[ tween.prop ] = tween.now; + } + } + } +}; + +// Support: IE <=9 only +// Panic based approach to setting things on disconnected nodes +Tween.propHooks.scrollTop = Tween.propHooks.scrollLeft = { + set: function( tween ) { + if ( tween.elem.nodeType && tween.elem.parentNode ) { + tween.elem[ tween.prop ] = tween.now; + } + } +}; + +jQuery.easing = { + linear: function( p ) { + return p; + }, + swing: function( p ) { + return 0.5 - Math.cos( p * Math.PI ) / 2; + }, + _default: "swing" +}; + +jQuery.fx = Tween.prototype.init; + +// Back compat <1.8 extension point +jQuery.fx.step = {}; + + + + +var + fxNow, inProgress, + rfxtypes = /^(?:toggle|show|hide)$/, + rrun = /queueHooks$/; + +function schedule() { + if ( inProgress ) { + if ( document.hidden === false && window.requestAnimationFrame ) { + window.requestAnimationFrame( schedule ); + } else { + window.setTimeout( schedule, jQuery.fx.interval ); + } + + jQuery.fx.tick(); + } +} + +// Animations created synchronously will run synchronously +function createFxNow() { + window.setTimeout( function() { + fxNow = undefined; + } ); + return ( fxNow = Date.now() ); +} + +// Generate parameters to create a standard animation +function genFx( type, includeWidth ) { + var which, + i = 0, + attrs = { height: type }; + + // If we include width, step value is 1 to do all cssExpand values, + // otherwise step value is 2 to skip over Left and Right + includeWidth = includeWidth ? 
1 : 0; + for ( ; i < 4; i += 2 - includeWidth ) { + which = cssExpand[ i ]; + attrs[ "margin" + which ] = attrs[ "padding" + which ] = type; + } + + if ( includeWidth ) { + attrs.opacity = attrs.width = type; + } + + return attrs; +} + +function createTween( value, prop, animation ) { + var tween, + collection = ( Animation.tweeners[ prop ] || [] ).concat( Animation.tweeners[ "*" ] ), + index = 0, + length = collection.length; + for ( ; index < length; index++ ) { + if ( ( tween = collection[ index ].call( animation, prop, value ) ) ) { + + // We're done with this property + return tween; + } + } +} + +function defaultPrefilter( elem, props, opts ) { + var prop, value, toggle, hooks, oldfire, propTween, restoreDisplay, display, + isBox = "width" in props || "height" in props, + anim = this, + orig = {}, + style = elem.style, + hidden = elem.nodeType && isHiddenWithinTree( elem ), + dataShow = dataPriv.get( elem, "fxshow" ); + + // Queue-skipping animations hijack the fx hooks + if ( !opts.queue ) { + hooks = jQuery._queueHooks( elem, "fx" ); + if ( hooks.unqueued == null ) { + hooks.unqueued = 0; + oldfire = hooks.empty.fire; + hooks.empty.fire = function() { + if ( !hooks.unqueued ) { + oldfire(); + } + }; + } + hooks.unqueued++; + + anim.always( function() { + + // Ensure the complete handler is called before this completes + anim.always( function() { + hooks.unqueued--; + if ( !jQuery.queue( elem, "fx" ).length ) { + hooks.empty.fire(); + } + } ); + } ); + } + + // Detect show/hide animations + for ( prop in props ) { + value = props[ prop ]; + if ( rfxtypes.test( value ) ) { + delete props[ prop ]; + toggle = toggle || value === "toggle"; + if ( value === ( hidden ? "hide" : "show" ) ) { + + // Pretend to be hidden if this is a "show" and + // there is still data from a stopped show/hide + if ( value === "show" && dataShow && dataShow[ prop ] !== undefined ) { + hidden = true; + + // Ignore all other no-op show/hide data + } else { + continue; + } + } + orig[ prop ] = dataShow && dataShow[ prop ] || jQuery.style( elem, prop ); + } + } + + // Bail out if this is a no-op like .hide().hide() + propTween = !jQuery.isEmptyObject( props ); + if ( !propTween && jQuery.isEmptyObject( orig ) ) { + return; + } + + // Restrict "overflow" and "display" styles during box animations + if ( isBox && elem.nodeType === 1 ) { + + // Support: IE <=9 - 11, Edge 12 - 15 + // Record all 3 overflow attributes because IE does not infer the shorthand + // from identically-valued overflowX and overflowY and Edge just mirrors + // the overflowX value there. 
+ opts.overflow = [ style.overflow, style.overflowX, style.overflowY ]; + + // Identify a display type, preferring old show/hide data over the CSS cascade + restoreDisplay = dataShow && dataShow.display; + if ( restoreDisplay == null ) { + restoreDisplay = dataPriv.get( elem, "display" ); + } + display = jQuery.css( elem, "display" ); + if ( display === "none" ) { + if ( restoreDisplay ) { + display = restoreDisplay; + } else { + + // Get nonempty value(s) by temporarily forcing visibility + showHide( [ elem ], true ); + restoreDisplay = elem.style.display || restoreDisplay; + display = jQuery.css( elem, "display" ); + showHide( [ elem ] ); + } + } + + // Animate inline elements as inline-block + if ( display === "inline" || display === "inline-block" && restoreDisplay != null ) { + if ( jQuery.css( elem, "float" ) === "none" ) { + + // Restore the original display value at the end of pure show/hide animations + if ( !propTween ) { + anim.done( function() { + style.display = restoreDisplay; + } ); + if ( restoreDisplay == null ) { + display = style.display; + restoreDisplay = display === "none" ? "" : display; + } + } + style.display = "inline-block"; + } + } + } + + if ( opts.overflow ) { + style.overflow = "hidden"; + anim.always( function() { + style.overflow = opts.overflow[ 0 ]; + style.overflowX = opts.overflow[ 1 ]; + style.overflowY = opts.overflow[ 2 ]; + } ); + } + + // Implement show/hide animations + propTween = false; + for ( prop in orig ) { + + // General show/hide setup for this element animation + if ( !propTween ) { + if ( dataShow ) { + if ( "hidden" in dataShow ) { + hidden = dataShow.hidden; + } + } else { + dataShow = dataPriv.access( elem, "fxshow", { display: restoreDisplay } ); + } + + // Store hidden/visible for toggle so `.stop().toggle()` "reverses" + if ( toggle ) { + dataShow.hidden = !hidden; + } + + // Show elements before animating them + if ( hidden ) { + showHide( [ elem ], true ); + } + + /* eslint-disable no-loop-func */ + + anim.done( function() { + + /* eslint-enable no-loop-func */ + + // The final step of a "hide" animation is actually hiding the element + if ( !hidden ) { + showHide( [ elem ] ); + } + dataPriv.remove( elem, "fxshow" ); + for ( prop in orig ) { + jQuery.style( elem, prop, orig[ prop ] ); + } + } ); + } + + // Per-property setup + propTween = createTween( hidden ? dataShow[ prop ] : 0, prop, anim ); + if ( !( prop in dataShow ) ) { + dataShow[ prop ] = propTween.start; + if ( hidden ) { + propTween.end = propTween.start; + propTween.start = 0; + } + } + } +} + +function propFilter( props, specialEasing ) { + var index, name, easing, value, hooks; + + // camelCase, specialEasing and expand cssHook pass + for ( index in props ) { + name = camelCase( index ); + easing = specialEasing[ name ]; + value = props[ index ]; + if ( Array.isArray( value ) ) { + easing = value[ 1 ]; + value = props[ index ] = value[ 0 ]; + } + + if ( index !== name ) { + props[ name ] = value; + delete props[ index ]; + } + + hooks = jQuery.cssHooks[ name ]; + if ( hooks && "expand" in hooks ) { + value = hooks.expand( value ); + delete props[ name ]; + + // Not quite $.extend, this won't overwrite existing keys. 
+ // Reusing 'index' because we have the correct "name" + for ( index in value ) { + if ( !( index in props ) ) { + props[ index ] = value[ index ]; + specialEasing[ index ] = easing; + } + } + } else { + specialEasing[ name ] = easing; + } + } +} + +function Animation( elem, properties, options ) { + var result, + stopped, + index = 0, + length = Animation.prefilters.length, + deferred = jQuery.Deferred().always( function() { + + // Don't match elem in the :animated selector + delete tick.elem; + } ), + tick = function() { + if ( stopped ) { + return false; + } + var currentTime = fxNow || createFxNow(), + remaining = Math.max( 0, animation.startTime + animation.duration - currentTime ), + + // Support: Android 2.3 only + // Archaic crash bug won't allow us to use `1 - ( 0.5 || 0 )` (#12497) + temp = remaining / animation.duration || 0, + percent = 1 - temp, + index = 0, + length = animation.tweens.length; + + for ( ; index < length; index++ ) { + animation.tweens[ index ].run( percent ); + } + + deferred.notifyWith( elem, [ animation, percent, remaining ] ); + + // If there's more to do, yield + if ( percent < 1 && length ) { + return remaining; + } + + // If this was an empty animation, synthesize a final progress notification + if ( !length ) { + deferred.notifyWith( elem, [ animation, 1, 0 ] ); + } + + // Resolve the animation and report its conclusion + deferred.resolveWith( elem, [ animation ] ); + return false; + }, + animation = deferred.promise( { + elem: elem, + props: jQuery.extend( {}, properties ), + opts: jQuery.extend( true, { + specialEasing: {}, + easing: jQuery.easing._default + }, options ), + originalProperties: properties, + originalOptions: options, + startTime: fxNow || createFxNow(), + duration: options.duration, + tweens: [], + createTween: function( prop, end ) { + var tween = jQuery.Tween( elem, animation.opts, prop, end, + animation.opts.specialEasing[ prop ] || animation.opts.easing ); + animation.tweens.push( tween ); + return tween; + }, + stop: function( gotoEnd ) { + var index = 0, + + // If we are going to the end, we want to run all the tweens + // otherwise we skip this part + length = gotoEnd ? 
animation.tweens.length : 0; + if ( stopped ) { + return this; + } + stopped = true; + for ( ; index < length; index++ ) { + animation.tweens[ index ].run( 1 ); + } + + // Resolve when we played the last frame; otherwise, reject + if ( gotoEnd ) { + deferred.notifyWith( elem, [ animation, 1, 0 ] ); + deferred.resolveWith( elem, [ animation, gotoEnd ] ); + } else { + deferred.rejectWith( elem, [ animation, gotoEnd ] ); + } + return this; + } + } ), + props = animation.props; + + propFilter( props, animation.opts.specialEasing ); + + for ( ; index < length; index++ ) { + result = Animation.prefilters[ index ].call( animation, elem, props, animation.opts ); + if ( result ) { + if ( isFunction( result.stop ) ) { + jQuery._queueHooks( animation.elem, animation.opts.queue ).stop = + result.stop.bind( result ); + } + return result; + } + } + + jQuery.map( props, createTween, animation ); + + if ( isFunction( animation.opts.start ) ) { + animation.opts.start.call( elem, animation ); + } + + // Attach callbacks from options + animation + .progress( animation.opts.progress ) + .done( animation.opts.done, animation.opts.complete ) + .fail( animation.opts.fail ) + .always( animation.opts.always ); + + jQuery.fx.timer( + jQuery.extend( tick, { + elem: elem, + anim: animation, + queue: animation.opts.queue + } ) + ); + + return animation; +} + +jQuery.Animation = jQuery.extend( Animation, { + + tweeners: { + "*": [ function( prop, value ) { + var tween = this.createTween( prop, value ); + adjustCSS( tween.elem, prop, rcssNum.exec( value ), tween ); + return tween; + } ] + }, + + tweener: function( props, callback ) { + if ( isFunction( props ) ) { + callback = props; + props = [ "*" ]; + } else { + props = props.match( rnothtmlwhite ); + } + + var prop, + index = 0, + length = props.length; + + for ( ; index < length; index++ ) { + prop = props[ index ]; + Animation.tweeners[ prop ] = Animation.tweeners[ prop ] || []; + Animation.tweeners[ prop ].unshift( callback ); + } + }, + + prefilters: [ defaultPrefilter ], + + prefilter: function( callback, prepend ) { + if ( prepend ) { + Animation.prefilters.unshift( callback ); + } else { + Animation.prefilters.push( callback ); + } + } +} ); + +jQuery.speed = function( speed, easing, fn ) { + var opt = speed && typeof speed === "object" ? 
jQuery.extend( {}, speed ) : { + complete: fn || !fn && easing || + isFunction( speed ) && speed, + duration: speed, + easing: fn && easing || easing && !isFunction( easing ) && easing + }; + + // Go to the end state if fx are off + if ( jQuery.fx.off ) { + opt.duration = 0; + + } else { + if ( typeof opt.duration !== "number" ) { + if ( opt.duration in jQuery.fx.speeds ) { + opt.duration = jQuery.fx.speeds[ opt.duration ]; + + } else { + opt.duration = jQuery.fx.speeds._default; + } + } + } + + // Normalize opt.queue - true/undefined/null -> "fx" + if ( opt.queue == null || opt.queue === true ) { + opt.queue = "fx"; + } + + // Queueing + opt.old = opt.complete; + + opt.complete = function() { + if ( isFunction( opt.old ) ) { + opt.old.call( this ); + } + + if ( opt.queue ) { + jQuery.dequeue( this, opt.queue ); + } + }; + + return opt; +}; + +jQuery.fn.extend( { + fadeTo: function( speed, to, easing, callback ) { + + // Show any hidden elements after setting opacity to 0 + return this.filter( isHiddenWithinTree ).css( "opacity", 0 ).show() + + // Animate to the value specified + .end().animate( { opacity: to }, speed, easing, callback ); + }, + animate: function( prop, speed, easing, callback ) { + var empty = jQuery.isEmptyObject( prop ), + optall = jQuery.speed( speed, easing, callback ), + doAnimation = function() { + + // Operate on a copy of prop so per-property easing won't be lost + var anim = Animation( this, jQuery.extend( {}, prop ), optall ); + + // Empty animations, or finishing resolves immediately + if ( empty || dataPriv.get( this, "finish" ) ) { + anim.stop( true ); + } + }; + doAnimation.finish = doAnimation; + + return empty || optall.queue === false ? + this.each( doAnimation ) : + this.queue( optall.queue, doAnimation ); + }, + stop: function( type, clearQueue, gotoEnd ) { + var stopQueue = function( hooks ) { + var stop = hooks.stop; + delete hooks.stop; + stop( gotoEnd ); + }; + + if ( typeof type !== "string" ) { + gotoEnd = clearQueue; + clearQueue = type; + type = undefined; + } + if ( clearQueue ) { + this.queue( type || "fx", [] ); + } + + return this.each( function() { + var dequeue = true, + index = type != null && type + "queueHooks", + timers = jQuery.timers, + data = dataPriv.get( this ); + + if ( index ) { + if ( data[ index ] && data[ index ].stop ) { + stopQueue( data[ index ] ); + } + } else { + for ( index in data ) { + if ( data[ index ] && data[ index ].stop && rrun.test( index ) ) { + stopQueue( data[ index ] ); + } + } + } + + for ( index = timers.length; index--; ) { + if ( timers[ index ].elem === this && + ( type == null || timers[ index ].queue === type ) ) { + + timers[ index ].anim.stop( gotoEnd ); + dequeue = false; + timers.splice( index, 1 ); + } + } + + // Start the next in the queue if the last step wasn't forced. + // Timers currently will call their complete callbacks, which + // will dequeue but only if they were gotoEnd. + if ( dequeue || !gotoEnd ) { + jQuery.dequeue( this, type ); + } + } ); + }, + finish: function( type ) { + if ( type !== false ) { + type = type || "fx"; + } + return this.each( function() { + var index, + data = dataPriv.get( this ), + queue = data[ type + "queue" ], + hooks = data[ type + "queueHooks" ], + timers = jQuery.timers, + length = queue ? 
queue.length : 0; + + // Enable finishing flag on private data + data.finish = true; + + // Empty the queue first + jQuery.queue( this, type, [] ); + + if ( hooks && hooks.stop ) { + hooks.stop.call( this, true ); + } + + // Look for any active animations, and finish them + for ( index = timers.length; index--; ) { + if ( timers[ index ].elem === this && timers[ index ].queue === type ) { + timers[ index ].anim.stop( true ); + timers.splice( index, 1 ); + } + } + + // Look for any animations in the old queue and finish them + for ( index = 0; index < length; index++ ) { + if ( queue[ index ] && queue[ index ].finish ) { + queue[ index ].finish.call( this ); + } + } + + // Turn off finishing flag + delete data.finish; + } ); + } +} ); + +jQuery.each( [ "toggle", "show", "hide" ], function( _i, name ) { + var cssFn = jQuery.fn[ name ]; + jQuery.fn[ name ] = function( speed, easing, callback ) { + return speed == null || typeof speed === "boolean" ? + cssFn.apply( this, arguments ) : + this.animate( genFx( name, true ), speed, easing, callback ); + }; +} ); + +// Generate shortcuts for custom animations +jQuery.each( { + slideDown: genFx( "show" ), + slideUp: genFx( "hide" ), + slideToggle: genFx( "toggle" ), + fadeIn: { opacity: "show" }, + fadeOut: { opacity: "hide" }, + fadeToggle: { opacity: "toggle" } +}, function( name, props ) { + jQuery.fn[ name ] = function( speed, easing, callback ) { + return this.animate( props, speed, easing, callback ); + }; +} ); + +jQuery.timers = []; +jQuery.fx.tick = function() { + var timer, + i = 0, + timers = jQuery.timers; + + fxNow = Date.now(); + + for ( ; i < timers.length; i++ ) { + timer = timers[ i ]; + + // Run the timer and safely remove it when done (allowing for external removal) + if ( !timer() && timers[ i ] === timer ) { + timers.splice( i--, 1 ); + } + } + + if ( !timers.length ) { + jQuery.fx.stop(); + } + fxNow = undefined; +}; + +jQuery.fx.timer = function( timer ) { + jQuery.timers.push( timer ); + jQuery.fx.start(); +}; + +jQuery.fx.interval = 13; +jQuery.fx.start = function() { + if ( inProgress ) { + return; + } + + inProgress = true; + schedule(); +}; + +jQuery.fx.stop = function() { + inProgress = null; +}; + +jQuery.fx.speeds = { + slow: 600, + fast: 200, + + // Default speed + _default: 400 +}; + + +// Based off of the plugin by Clint Helfers, with permission. +// https://web.archive.org/web/20100324014747/http://blindsignals.com/index.php/2009/07/jquery-delay/ +jQuery.fn.delay = function( time, type ) { + time = jQuery.fx ? 
jQuery.fx.speeds[ time ] || time : time; + type = type || "fx"; + + return this.queue( type, function( next, hooks ) { + var timeout = window.setTimeout( next, time ); + hooks.stop = function() { + window.clearTimeout( timeout ); + }; + } ); +}; + + +( function() { + var input = document.createElement( "input" ), + select = document.createElement( "select" ), + opt = select.appendChild( document.createElement( "option" ) ); + + input.type = "checkbox"; + + // Support: Android <=4.3 only + // Default value for a checkbox should be "on" + support.checkOn = input.value !== ""; + + // Support: IE <=11 only + // Must access selectedIndex to make default options select + support.optSelected = opt.selected; + + // Support: IE <=11 only + // An input loses its value after becoming a radio + input = document.createElement( "input" ); + input.value = "t"; + input.type = "radio"; + support.radioValue = input.value === "t"; +} )(); + + +var boolHook, + attrHandle = jQuery.expr.attrHandle; + +jQuery.fn.extend( { + attr: function( name, value ) { + return access( this, jQuery.attr, name, value, arguments.length > 1 ); + }, + + removeAttr: function( name ) { + return this.each( function() { + jQuery.removeAttr( this, name ); + } ); + } +} ); + +jQuery.extend( { + attr: function( elem, name, value ) { + var ret, hooks, + nType = elem.nodeType; + + // Don't get/set attributes on text, comment and attribute nodes + if ( nType === 3 || nType === 8 || nType === 2 ) { + return; + } + + // Fallback to prop when attributes are not supported + if ( typeof elem.getAttribute === "undefined" ) { + return jQuery.prop( elem, name, value ); + } + + // Attribute hooks are determined by the lowercase version + // Grab necessary hook if one is defined + if ( nType !== 1 || !jQuery.isXMLDoc( elem ) ) { + hooks = jQuery.attrHooks[ name.toLowerCase() ] || + ( jQuery.expr.match.bool.test( name ) ? boolHook : undefined ); + } + + if ( value !== undefined ) { + if ( value === null ) { + jQuery.removeAttr( elem, name ); + return; + } + + if ( hooks && "set" in hooks && + ( ret = hooks.set( elem, value, name ) ) !== undefined ) { + return ret; + } + + elem.setAttribute( name, value + "" ); + return value; + } + + if ( hooks && "get" in hooks && ( ret = hooks.get( elem, name ) ) !== null ) { + return ret; + } + + ret = jQuery.find.attr( elem, name ); + + // Non-existent attributes return null, we normalize to undefined + return ret == null ? 
undefined : ret; + }, + + attrHooks: { + type: { + set: function( elem, value ) { + if ( !support.radioValue && value === "radio" && + nodeName( elem, "input" ) ) { + var val = elem.value; + elem.setAttribute( "type", value ); + if ( val ) { + elem.value = val; + } + return value; + } + } + } + }, + + removeAttr: function( elem, value ) { + var name, + i = 0, + + // Attribute names can contain non-HTML whitespace characters + // https://html.spec.whatwg.org/multipage/syntax.html#attributes-2 + attrNames = value && value.match( rnothtmlwhite ); + + if ( attrNames && elem.nodeType === 1 ) { + while ( ( name = attrNames[ i++ ] ) ) { + elem.removeAttribute( name ); + } + } + } +} ); + +// Hooks for boolean attributes +boolHook = { + set: function( elem, value, name ) { + if ( value === false ) { + + // Remove boolean attributes when set to false + jQuery.removeAttr( elem, name ); + } else { + elem.setAttribute( name, name ); + } + return name; + } +}; + +jQuery.each( jQuery.expr.match.bool.source.match( /\w+/g ), function( _i, name ) { + var getter = attrHandle[ name ] || jQuery.find.attr; + + attrHandle[ name ] = function( elem, name, isXML ) { + var ret, handle, + lowercaseName = name.toLowerCase(); + + if ( !isXML ) { + + // Avoid an infinite loop by temporarily removing this function from the getter + handle = attrHandle[ lowercaseName ]; + attrHandle[ lowercaseName ] = ret; + ret = getter( elem, name, isXML ) != null ? + lowercaseName : + null; + attrHandle[ lowercaseName ] = handle; + } + return ret; + }; +} ); + + + + +var rfocusable = /^(?:input|select|textarea|button)$/i, + rclickable = /^(?:a|area)$/i; + +jQuery.fn.extend( { + prop: function( name, value ) { + return access( this, jQuery.prop, name, value, arguments.length > 1 ); + }, + + removeProp: function( name ) { + return this.each( function() { + delete this[ jQuery.propFix[ name ] || name ]; + } ); + } +} ); + +jQuery.extend( { + prop: function( elem, name, value ) { + var ret, hooks, + nType = elem.nodeType; + + // Don't get/set properties on text, comment and attribute nodes + if ( nType === 3 || nType === 8 || nType === 2 ) { + return; + } + + if ( nType !== 1 || !jQuery.isXMLDoc( elem ) ) { + + // Fix name and attach hooks + name = jQuery.propFix[ name ] || name; + hooks = jQuery.propHooks[ name ]; + } + + if ( value !== undefined ) { + if ( hooks && "set" in hooks && + ( ret = hooks.set( elem, value, name ) ) !== undefined ) { + return ret; + } + + return ( elem[ name ] = value ); + } + + if ( hooks && "get" in hooks && ( ret = hooks.get( elem, name ) ) !== null ) { + return ret; + } + + return elem[ name ]; + }, + + propHooks: { + tabIndex: { + get: function( elem ) { + + // Support: IE <=9 - 11 only + // elem.tabIndex doesn't always return the + // correct value when it hasn't been explicitly set + // https://web.archive.org/web/20141116233347/http://fluidproject.org/blog/2008/01/09/getting-setting-and-removing-tabindex-values-with-javascript/ + // Use proper attribute retrieval(#12072) + var tabindex = jQuery.find.attr( elem, "tabindex" ); + + if ( tabindex ) { + return parseInt( tabindex, 10 ); + } + + if ( + rfocusable.test( elem.nodeName ) || + rclickable.test( elem.nodeName ) && + elem.href + ) { + return 0; + } + + return -1; + } + } + }, + + propFix: { + "for": "htmlFor", + "class": "className" + } +} ); + +// Support: IE <=11 only +// Accessing the selectedIndex property +// forces the browser to respect setting selected +// on the option +// The getter ensures a default option is selected +// when in an 
optgroup +// eslint rule "no-unused-expressions" is disabled for this code +// since it considers such accessions noop +if ( !support.optSelected ) { + jQuery.propHooks.selected = { + get: function( elem ) { + + /* eslint no-unused-expressions: "off" */ + + var parent = elem.parentNode; + if ( parent && parent.parentNode ) { + parent.parentNode.selectedIndex; + } + return null; + }, + set: function( elem ) { + + /* eslint no-unused-expressions: "off" */ + + var parent = elem.parentNode; + if ( parent ) { + parent.selectedIndex; + + if ( parent.parentNode ) { + parent.parentNode.selectedIndex; + } + } + } + }; +} + +jQuery.each( [ + "tabIndex", + "readOnly", + "maxLength", + "cellSpacing", + "cellPadding", + "rowSpan", + "colSpan", + "useMap", + "frameBorder", + "contentEditable" +], function() { + jQuery.propFix[ this.toLowerCase() ] = this; +} ); + + + + + // Strip and collapse whitespace according to HTML spec + // https://infra.spec.whatwg.org/#strip-and-collapse-ascii-whitespace + function stripAndCollapse( value ) { + var tokens = value.match( rnothtmlwhite ) || []; + return tokens.join( " " ); + } + + +function getClass( elem ) { + return elem.getAttribute && elem.getAttribute( "class" ) || ""; +} + +function classesToArray( value ) { + if ( Array.isArray( value ) ) { + return value; + } + if ( typeof value === "string" ) { + return value.match( rnothtmlwhite ) || []; + } + return []; +} + +jQuery.fn.extend( { + addClass: function( value ) { + var classes, elem, cur, curValue, clazz, j, finalValue, + i = 0; + + if ( isFunction( value ) ) { + return this.each( function( j ) { + jQuery( this ).addClass( value.call( this, j, getClass( this ) ) ); + } ); + } + + classes = classesToArray( value ); + + if ( classes.length ) { + while ( ( elem = this[ i++ ] ) ) { + curValue = getClass( elem ); + cur = elem.nodeType === 1 && ( " " + stripAndCollapse( curValue ) + " " ); + + if ( cur ) { + j = 0; + while ( ( clazz = classes[ j++ ] ) ) { + if ( cur.indexOf( " " + clazz + " " ) < 0 ) { + cur += clazz + " "; + } + } + + // Only assign if different to avoid unneeded rendering. + finalValue = stripAndCollapse( cur ); + if ( curValue !== finalValue ) { + elem.setAttribute( "class", finalValue ); + } + } + } + } + + return this; + }, + + removeClass: function( value ) { + var classes, elem, cur, curValue, clazz, j, finalValue, + i = 0; + + if ( isFunction( value ) ) { + return this.each( function( j ) { + jQuery( this ).removeClass( value.call( this, j, getClass( this ) ) ); + } ); + } + + if ( !arguments.length ) { + return this.attr( "class", "" ); + } + + classes = classesToArray( value ); + + if ( classes.length ) { + while ( ( elem = this[ i++ ] ) ) { + curValue = getClass( elem ); + + // This expression is here for better compressibility (see addClass) + cur = elem.nodeType === 1 && ( " " + stripAndCollapse( curValue ) + " " ); + + if ( cur ) { + j = 0; + while ( ( clazz = classes[ j++ ] ) ) { + + // Remove *all* instances + while ( cur.indexOf( " " + clazz + " " ) > -1 ) { + cur = cur.replace( " " + clazz + " ", " " ); + } + } + + // Only assign if different to avoid unneeded rendering. + finalValue = stripAndCollapse( cur ); + if ( curValue !== finalValue ) { + elem.setAttribute( "class", finalValue ); + } + } + } + } + + return this; + }, + + toggleClass: function( value, stateVal ) { + var type = typeof value, + isValidValue = type === "string" || Array.isArray( value ); + + if ( typeof stateVal === "boolean" && isValidValue ) { + return stateVal ? 
this.addClass( value ) : this.removeClass( value ); + } + + if ( isFunction( value ) ) { + return this.each( function( i ) { + jQuery( this ).toggleClass( + value.call( this, i, getClass( this ), stateVal ), + stateVal + ); + } ); + } + + return this.each( function() { + var className, i, self, classNames; + + if ( isValidValue ) { + + // Toggle individual class names + i = 0; + self = jQuery( this ); + classNames = classesToArray( value ); + + while ( ( className = classNames[ i++ ] ) ) { + + // Check each className given, space separated list + if ( self.hasClass( className ) ) { + self.removeClass( className ); + } else { + self.addClass( className ); + } + } + + // Toggle whole class name + } else if ( value === undefined || type === "boolean" ) { + className = getClass( this ); + if ( className ) { + + // Store className if set + dataPriv.set( this, "__className__", className ); + } + + // If the element has a class name or if we're passed `false`, + // then remove the whole classname (if there was one, the above saved it). + // Otherwise bring back whatever was previously saved (if anything), + // falling back to the empty string if nothing was stored. + if ( this.setAttribute ) { + this.setAttribute( "class", + className || value === false ? + "" : + dataPriv.get( this, "__className__" ) || "" + ); + } + } + } ); + }, + + hasClass: function( selector ) { + var className, elem, + i = 0; + + className = " " + selector + " "; + while ( ( elem = this[ i++ ] ) ) { + if ( elem.nodeType === 1 && + ( " " + stripAndCollapse( getClass( elem ) ) + " " ).indexOf( className ) > -1 ) { + return true; + } + } + + return false; + } +} ); + + + + +var rreturn = /\r/g; + +jQuery.fn.extend( { + val: function( value ) { + var hooks, ret, valueIsFunction, + elem = this[ 0 ]; + + if ( !arguments.length ) { + if ( elem ) { + hooks = jQuery.valHooks[ elem.type ] || + jQuery.valHooks[ elem.nodeName.toLowerCase() ]; + + if ( hooks && + "get" in hooks && + ( ret = hooks.get( elem, "value" ) ) !== undefined + ) { + return ret; + } + + ret = elem.value; + + // Handle most common string cases + if ( typeof ret === "string" ) { + return ret.replace( rreturn, "" ); + } + + // Handle cases where value is null/undef or number + return ret == null ? "" : ret; + } + + return; + } + + valueIsFunction = isFunction( value ); + + return this.each( function( i ) { + var val; + + if ( this.nodeType !== 1 ) { + return; + } + + if ( valueIsFunction ) { + val = value.call( this, i, jQuery( this ).val() ); + } else { + val = value; + } + + // Treat null/undefined as ""; convert numbers to string + if ( val == null ) { + val = ""; + + } else if ( typeof val === "number" ) { + val += ""; + + } else if ( Array.isArray( val ) ) { + val = jQuery.map( val, function( value ) { + return value == null ? "" : value + ""; + } ); + } + + hooks = jQuery.valHooks[ this.type ] || jQuery.valHooks[ this.nodeName.toLowerCase() ]; + + // If set returns undefined, fall back to normal setting + if ( !hooks || !( "set" in hooks ) || hooks.set( this, val, "value" ) === undefined ) { + this.value = val; + } + } ); + } +} ); + +jQuery.extend( { + valHooks: { + option: { + get: function( elem ) { + + var val = jQuery.find.attr( elem, "value" ); + return val != null ? 
+ val : + + // Support: IE <=10 - 11 only + // option.text throws exceptions (#14686, #14858) + // Strip and collapse whitespace + // https://html.spec.whatwg.org/#strip-and-collapse-whitespace + stripAndCollapse( jQuery.text( elem ) ); + } + }, + select: { + get: function( elem ) { + var value, option, i, + options = elem.options, + index = elem.selectedIndex, + one = elem.type === "select-one", + values = one ? null : [], + max = one ? index + 1 : options.length; + + if ( index < 0 ) { + i = max; + + } else { + i = one ? index : 0; + } + + // Loop through all the selected options + for ( ; i < max; i++ ) { + option = options[ i ]; + + // Support: IE <=9 only + // IE8-9 doesn't update selected after form reset (#2551) + if ( ( option.selected || i === index ) && + + // Don't return options that are disabled or in a disabled optgroup + !option.disabled && + ( !option.parentNode.disabled || + !nodeName( option.parentNode, "optgroup" ) ) ) { + + // Get the specific value for the option + value = jQuery( option ).val(); + + // We don't need an array for one selects + if ( one ) { + return value; + } + + // Multi-Selects return an array + values.push( value ); + } + } + + return values; + }, + + set: function( elem, value ) { + var optionSet, option, + options = elem.options, + values = jQuery.makeArray( value ), + i = options.length; + + while ( i-- ) { + option = options[ i ]; + + /* eslint-disable no-cond-assign */ + + if ( option.selected = + jQuery.inArray( jQuery.valHooks.option.get( option ), values ) > -1 + ) { + optionSet = true; + } + + /* eslint-enable no-cond-assign */ + } + + // Force browsers to behave consistently when non-matching value is set + if ( !optionSet ) { + elem.selectedIndex = -1; + } + return values; + } + } + } +} ); + +// Radios and checkboxes getter/setter +jQuery.each( [ "radio", "checkbox" ], function() { + jQuery.valHooks[ this ] = { + set: function( elem, value ) { + if ( Array.isArray( value ) ) { + return ( elem.checked = jQuery.inArray( jQuery( elem ).val(), value ) > -1 ); + } + } + }; + if ( !support.checkOn ) { + jQuery.valHooks[ this ].get = function( elem ) { + return elem.getAttribute( "value" ) === null ? "on" : elem.value; + }; + } +} ); + + + + +// Return jQuery for attributes-only inclusion + + +support.focusin = "onfocusin" in window; + + +var rfocusMorph = /^(?:focusinfocus|focusoutblur)$/, + stopPropagationCallback = function( e ) { + e.stopPropagation(); + }; + +jQuery.extend( jQuery.event, { + + trigger: function( event, data, elem, onlyHandlers ) { + + var i, cur, tmp, bubbleType, ontype, handle, special, lastElement, + eventPath = [ elem || document ], + type = hasOwn.call( event, "type" ) ? event.type : event, + namespaces = hasOwn.call( event, "namespace" ) ? event.namespace.split( "." ) : []; + + cur = lastElement = tmp = elem = elem || document; + + // Don't do events on text and comment nodes + if ( elem.nodeType === 3 || elem.nodeType === 8 ) { + return; + } + + // focus/blur morphs to focusin/out; ensure we're not firing them right now + if ( rfocusMorph.test( type + jQuery.event.triggered ) ) { + return; + } + + if ( type.indexOf( "." ) > -1 ) { + + // Namespaced trigger; create a regexp to match event type in handle() + namespaces = type.split( "." ); + type = namespaces.shift(); + namespaces.sort(); + } + ontype = type.indexOf( ":" ) < 0 && "on" + type; + + // Caller can pass in a jQuery.Event object, Object, or just an event type string + event = event[ jQuery.expando ] ? 
+ event : + new jQuery.Event( type, typeof event === "object" && event ); + + // Trigger bitmask: & 1 for native handlers; & 2 for jQuery (always true) + event.isTrigger = onlyHandlers ? 2 : 3; + event.namespace = namespaces.join( "." ); + event.rnamespace = event.namespace ? + new RegExp( "(^|\\.)" + namespaces.join( "\\.(?:.*\\.|)" ) + "(\\.|$)" ) : + null; + + // Clean up the event in case it is being reused + event.result = undefined; + if ( !event.target ) { + event.target = elem; + } + + // Clone any incoming data and prepend the event, creating the handler arg list + data = data == null ? + [ event ] : + jQuery.makeArray( data, [ event ] ); + + // Allow special events to draw outside the lines + special = jQuery.event.special[ type ] || {}; + if ( !onlyHandlers && special.trigger && special.trigger.apply( elem, data ) === false ) { + return; + } + + // Determine event propagation path in advance, per W3C events spec (#9951) + // Bubble up to document, then to window; watch for a global ownerDocument var (#9724) + if ( !onlyHandlers && !special.noBubble && !isWindow( elem ) ) { + + bubbleType = special.delegateType || type; + if ( !rfocusMorph.test( bubbleType + type ) ) { + cur = cur.parentNode; + } + for ( ; cur; cur = cur.parentNode ) { + eventPath.push( cur ); + tmp = cur; + } + + // Only add window if we got to document (e.g., not plain obj or detached DOM) + if ( tmp === ( elem.ownerDocument || document ) ) { + eventPath.push( tmp.defaultView || tmp.parentWindow || window ); + } + } + + // Fire handlers on the event path + i = 0; + while ( ( cur = eventPath[ i++ ] ) && !event.isPropagationStopped() ) { + lastElement = cur; + event.type = i > 1 ? + bubbleType : + special.bindType || type; + + // jQuery handler + handle = ( + dataPriv.get( cur, "events" ) || Object.create( null ) + )[ event.type ] && + dataPriv.get( cur, "handle" ); + if ( handle ) { + handle.apply( cur, data ); + } + + // Native handler + handle = ontype && cur[ ontype ]; + if ( handle && handle.apply && acceptData( cur ) ) { + event.result = handle.apply( cur, data ); + if ( event.result === false ) { + event.preventDefault(); + } + } + } + event.type = type; + + // If nobody prevented the default action, do it now + if ( !onlyHandlers && !event.isDefaultPrevented() ) { + + if ( ( !special._default || + special._default.apply( eventPath.pop(), data ) === false ) && + acceptData( elem ) ) { + + // Call a native DOM method on the target with the same name as the event. 
+ // Don't do default actions on window, that's where global variables be (#6170) + if ( ontype && isFunction( elem[ type ] ) && !isWindow( elem ) ) { + + // Don't re-trigger an onFOO event when we call its FOO() method + tmp = elem[ ontype ]; + + if ( tmp ) { + elem[ ontype ] = null; + } + + // Prevent re-triggering of the same event, since we already bubbled it above + jQuery.event.triggered = type; + + if ( event.isPropagationStopped() ) { + lastElement.addEventListener( type, stopPropagationCallback ); + } + + elem[ type ](); + + if ( event.isPropagationStopped() ) { + lastElement.removeEventListener( type, stopPropagationCallback ); + } + + jQuery.event.triggered = undefined; + + if ( tmp ) { + elem[ ontype ] = tmp; + } + } + } + } + + return event.result; + }, + + // Piggyback on a donor event to simulate a different one + // Used only for `focus(in | out)` events + simulate: function( type, elem, event ) { + var e = jQuery.extend( + new jQuery.Event(), + event, + { + type: type, + isSimulated: true + } + ); + + jQuery.event.trigger( e, null, elem ); + } + +} ); + +jQuery.fn.extend( { + + trigger: function( type, data ) { + return this.each( function() { + jQuery.event.trigger( type, data, this ); + } ); + }, + triggerHandler: function( type, data ) { + var elem = this[ 0 ]; + if ( elem ) { + return jQuery.event.trigger( type, data, elem, true ); + } + } +} ); + + +// Support: Firefox <=44 +// Firefox doesn't have focus(in | out) events +// Related ticket - https://bugzilla.mozilla.org/show_bug.cgi?id=687787 +// +// Support: Chrome <=48 - 49, Safari <=9.0 - 9.1 +// focus(in | out) events fire after focus & blur events, +// which is spec violation - http://www.w3.org/TR/DOM-Level-3-Events/#events-focusevent-event-order +// Related ticket - https://bugs.chromium.org/p/chromium/issues/detail?id=449857 +if ( !support.focusin ) { + jQuery.each( { focus: "focusin", blur: "focusout" }, function( orig, fix ) { + + // Attach a single capturing handler on the document while someone wants focusin/focusout + var handler = function( event ) { + jQuery.event.simulate( fix, event.target, jQuery.event.fix( event ) ); + }; + + jQuery.event.special[ fix ] = { + setup: function() { + + // Handle: regular nodes (via `this.ownerDocument`), window + // (via `this.document`) & document (via `this`). + var doc = this.ownerDocument || this.document || this, + attaches = dataPriv.access( doc, fix ); + + if ( !attaches ) { + doc.addEventListener( orig, handler, true ); + } + dataPriv.access( doc, fix, ( attaches || 0 ) + 1 ); + }, + teardown: function() { + var doc = this.ownerDocument || this.document || this, + attaches = dataPriv.access( doc, fix ) - 1; + + if ( !attaches ) { + doc.removeEventListener( orig, handler, true ); + dataPriv.remove( doc, fix ); + + } else { + dataPriv.access( doc, fix, attaches ); + } + } + }; + } ); +} +var location = window.location; + +var nonce = { guid: Date.now() }; + +var rquery = ( /\?/ ); + + + +// Cross-browser xml parsing +jQuery.parseXML = function( data ) { + var xml; + if ( !data || typeof data !== "string" ) { + return null; + } + + // Support: IE 9 - 11 only + // IE throws on parseFromString with invalid input. 
+ try { + xml = ( new window.DOMParser() ).parseFromString( data, "text/xml" ); + } catch ( e ) { + xml = undefined; + } + + if ( !xml || xml.getElementsByTagName( "parsererror" ).length ) { + jQuery.error( "Invalid XML: " + data ); + } + return xml; +}; + + +var + rbracket = /\[\]$/, + rCRLF = /\r?\n/g, + rsubmitterTypes = /^(?:submit|button|image|reset|file)$/i, + rsubmittable = /^(?:input|select|textarea|keygen)/i; + +function buildParams( prefix, obj, traditional, add ) { + var name; + + if ( Array.isArray( obj ) ) { + + // Serialize array item. + jQuery.each( obj, function( i, v ) { + if ( traditional || rbracket.test( prefix ) ) { + + // Treat each array item as a scalar. + add( prefix, v ); + + } else { + + // Item is non-scalar (array or object), encode its numeric index. + buildParams( + prefix + "[" + ( typeof v === "object" && v != null ? i : "" ) + "]", + v, + traditional, + add + ); + } + } ); + + } else if ( !traditional && toType( obj ) === "object" ) { + + // Serialize object item. + for ( name in obj ) { + buildParams( prefix + "[" + name + "]", obj[ name ], traditional, add ); + } + + } else { + + // Serialize scalar item. + add( prefix, obj ); + } +} + +// Serialize an array of form elements or a set of +// key/values into a query string +jQuery.param = function( a, traditional ) { + var prefix, + s = [], + add = function( key, valueOrFunction ) { + + // If value is a function, invoke it and use its return value + var value = isFunction( valueOrFunction ) ? + valueOrFunction() : + valueOrFunction; + + s[ s.length ] = encodeURIComponent( key ) + "=" + + encodeURIComponent( value == null ? "" : value ); + }; + + if ( a == null ) { + return ""; + } + + // If an array was passed in, assume that it is an array of form elements. + if ( Array.isArray( a ) || ( a.jquery && !jQuery.isPlainObject( a ) ) ) { + + // Serialize the form elements + jQuery.each( a, function() { + add( this.name, this.value ); + } ); + + } else { + + // If traditional, encode the "old" way (the way 1.3.2 or older + // did it), otherwise encode params recursively. + for ( prefix in a ) { + buildParams( prefix, a[ prefix ], traditional, add ); + } + } + + // Return the resulting serialization + return s.join( "&" ); +}; + +jQuery.fn.extend( { + serialize: function() { + return jQuery.param( this.serializeArray() ); + }, + serializeArray: function() { + return this.map( function() { + + // Can add propHook for "elements" to filter or add form elements + var elements = jQuery.prop( this, "elements" ); + return elements ? 
jQuery.makeArray( elements ) : this; + } ) + .filter( function() { + var type = this.type; + + // Use .is( ":disabled" ) so that fieldset[disabled] works + return this.name && !jQuery( this ).is( ":disabled" ) && + rsubmittable.test( this.nodeName ) && !rsubmitterTypes.test( type ) && + ( this.checked || !rcheckableType.test( type ) ); + } ) + .map( function( _i, elem ) { + var val = jQuery( this ).val(); + + if ( val == null ) { + return null; + } + + if ( Array.isArray( val ) ) { + return jQuery.map( val, function( val ) { + return { name: elem.name, value: val.replace( rCRLF, "\r\n" ) }; + } ); + } + + return { name: elem.name, value: val.replace( rCRLF, "\r\n" ) }; + } ).get(); + } +} ); + + +var + r20 = /%20/g, + rhash = /#.*$/, + rantiCache = /([?&])_=[^&]*/, + rheaders = /^(.*?):[ \t]*([^\r\n]*)$/mg, + + // #7653, #8125, #8152: local protocol detection + rlocalProtocol = /^(?:about|app|app-storage|.+-extension|file|res|widget):$/, + rnoContent = /^(?:GET|HEAD)$/, + rprotocol = /^\/\//, + + /* Prefilters + * 1) They are useful to introduce custom dataTypes (see ajax/jsonp.js for an example) + * 2) These are called: + * - BEFORE asking for a transport + * - AFTER param serialization (s.data is a string if s.processData is true) + * 3) key is the dataType + * 4) the catchall symbol "*" can be used + * 5) execution will start with transport dataType and THEN continue down to "*" if needed + */ + prefilters = {}, + + /* Transports bindings + * 1) key is the dataType + * 2) the catchall symbol "*" can be used + * 3) selection will start with transport dataType and THEN go to "*" if needed + */ + transports = {}, + + // Avoid comment-prolog char sequence (#10098); must appease lint and evade compression + allTypes = "*/".concat( "*" ), + + // Anchor tag for parsing the document origin + originAnchor = document.createElement( "a" ); + originAnchor.href = location.href; + +// Base "constructor" for jQuery.ajaxPrefilter and jQuery.ajaxTransport +function addToPrefiltersOrTransports( structure ) { + + // dataTypeExpression is optional and defaults to "*" + return function( dataTypeExpression, func ) { + + if ( typeof dataTypeExpression !== "string" ) { + func = dataTypeExpression; + dataTypeExpression = "*"; + } + + var dataType, + i = 0, + dataTypes = dataTypeExpression.toLowerCase().match( rnothtmlwhite ) || []; + + if ( isFunction( func ) ) { + + // For each dataType in the dataTypeExpression + while ( ( dataType = dataTypes[ i++ ] ) ) { + + // Prepend if requested + if ( dataType[ 0 ] === "+" ) { + dataType = dataType.slice( 1 ) || "*"; + ( structure[ dataType ] = structure[ dataType ] || [] ).unshift( func ); + + // Otherwise append + } else { + ( structure[ dataType ] = structure[ dataType ] || [] ).push( func ); + } + } + } + }; +} + +// Base inspection function for prefilters and transports +function inspectPrefiltersOrTransports( structure, options, originalOptions, jqXHR ) { + + var inspected = {}, + seekingTransport = ( structure === transports ); + + function inspect( dataType ) { + var selected; + inspected[ dataType ] = true; + jQuery.each( structure[ dataType ] || [], function( _, prefilterOrFactory ) { + var dataTypeOrTransport = prefilterOrFactory( options, originalOptions, jqXHR ); + if ( typeof dataTypeOrTransport === "string" && + !seekingTransport && !inspected[ dataTypeOrTransport ] ) { + + options.dataTypes.unshift( dataTypeOrTransport ); + inspect( dataTypeOrTransport ); + return false; + } else if ( seekingTransport ) { + return !( selected = dataTypeOrTransport ); + } 
+ } ); + return selected; + } + + return inspect( options.dataTypes[ 0 ] ) || !inspected[ "*" ] && inspect( "*" ); +} + +// A special extend for ajax options +// that takes "flat" options (not to be deep extended) +// Fixes #9887 +function ajaxExtend( target, src ) { + var key, deep, + flatOptions = jQuery.ajaxSettings.flatOptions || {}; + + for ( key in src ) { + if ( src[ key ] !== undefined ) { + ( flatOptions[ key ] ? target : ( deep || ( deep = {} ) ) )[ key ] = src[ key ]; + } + } + if ( deep ) { + jQuery.extend( true, target, deep ); + } + + return target; +} + +/* Handles responses to an ajax request: + * - finds the right dataType (mediates between content-type and expected dataType) + * - returns the corresponding response + */ +function ajaxHandleResponses( s, jqXHR, responses ) { + + var ct, type, finalDataType, firstDataType, + contents = s.contents, + dataTypes = s.dataTypes; + + // Remove auto dataType and get content-type in the process + while ( dataTypes[ 0 ] === "*" ) { + dataTypes.shift(); + if ( ct === undefined ) { + ct = s.mimeType || jqXHR.getResponseHeader( "Content-Type" ); + } + } + + // Check if we're dealing with a known content-type + if ( ct ) { + for ( type in contents ) { + if ( contents[ type ] && contents[ type ].test( ct ) ) { + dataTypes.unshift( type ); + break; + } + } + } + + // Check to see if we have a response for the expected dataType + if ( dataTypes[ 0 ] in responses ) { + finalDataType = dataTypes[ 0 ]; + } else { + + // Try convertible dataTypes + for ( type in responses ) { + if ( !dataTypes[ 0 ] || s.converters[ type + " " + dataTypes[ 0 ] ] ) { + finalDataType = type; + break; + } + if ( !firstDataType ) { + firstDataType = type; + } + } + + // Or just use first one + finalDataType = finalDataType || firstDataType; + } + + // If we found a dataType + // We add the dataType to the list if needed + // and return the corresponding response + if ( finalDataType ) { + if ( finalDataType !== dataTypes[ 0 ] ) { + dataTypes.unshift( finalDataType ); + } + return responses[ finalDataType ]; + } +} + +/* Chain conversions given the request and the original response + * Also sets the responseXXX fields on the jqXHR instance + */ +function ajaxConvert( s, response, jqXHR, isSuccess ) { + var conv2, current, conv, tmp, prev, + converters = {}, + + // Work with a copy of dataTypes in case we need to modify it for conversion + dataTypes = s.dataTypes.slice(); + + // Create converters map with lowercased keys + if ( dataTypes[ 1 ] ) { + for ( conv in s.converters ) { + converters[ conv.toLowerCase() ] = s.converters[ conv ]; + } + } + + current = dataTypes.shift(); + + // Convert to each sequential dataType + while ( current ) { + + if ( s.responseFields[ current ] ) { + jqXHR[ s.responseFields[ current ] ] = response; + } + + // Apply the dataFilter if provided + if ( !prev && isSuccess && s.dataFilter ) { + response = s.dataFilter( response, s.dataType ); + } + + prev = current; + current = dataTypes.shift(); + + if ( current ) { + + // There's only work to do if current dataType is non-auto + if ( current === "*" ) { + + current = prev; + + // Convert response if prev dataType is non-auto and differs from current + } else if ( prev !== "*" && prev !== current ) { + + // Seek a direct converter + conv = converters[ prev + " " + current ] || converters[ "* " + current ]; + + // If none found, seek a pair + if ( !conv ) { + for ( conv2 in converters ) { + + // If conv2 outputs current + tmp = conv2.split( " " ); + if ( tmp[ 1 ] === current ) { + + // If 
prev can be converted to accepted input + conv = converters[ prev + " " + tmp[ 0 ] ] || + converters[ "* " + tmp[ 0 ] ]; + if ( conv ) { + + // Condense equivalence converters + if ( conv === true ) { + conv = converters[ conv2 ]; + + // Otherwise, insert the intermediate dataType + } else if ( converters[ conv2 ] !== true ) { + current = tmp[ 0 ]; + dataTypes.unshift( tmp[ 1 ] ); + } + break; + } + } + } + } + + // Apply converter (if not an equivalence) + if ( conv !== true ) { + + // Unless errors are allowed to bubble, catch and return them + if ( conv && s.throws ) { + response = conv( response ); + } else { + try { + response = conv( response ); + } catch ( e ) { + return { + state: "parsererror", + error: conv ? e : "No conversion from " + prev + " to " + current + }; + } + } + } + } + } + } + + return { state: "success", data: response }; +} + +jQuery.extend( { + + // Counter for holding the number of active queries + active: 0, + + // Last-Modified header cache for next request + lastModified: {}, + etag: {}, + + ajaxSettings: { + url: location.href, + type: "GET", + isLocal: rlocalProtocol.test( location.protocol ), + global: true, + processData: true, + async: true, + contentType: "application/x-www-form-urlencoded; charset=UTF-8", + + /* + timeout: 0, + data: null, + dataType: null, + username: null, + password: null, + cache: null, + throws: false, + traditional: false, + headers: {}, + */ + + accepts: { + "*": allTypes, + text: "text/plain", + html: "text/html", + xml: "application/xml, text/xml", + json: "application/json, text/javascript" + }, + + contents: { + xml: /\bxml\b/, + html: /\bhtml/, + json: /\bjson\b/ + }, + + responseFields: { + xml: "responseXML", + text: "responseText", + json: "responseJSON" + }, + + // Data converters + // Keys separate source (or catchall "*") and destination types with a single space + converters: { + + // Convert anything to text + "* text": String, + + // Text to html (true = no transformation) + "text html": true, + + // Evaluate text as a json expression + "text json": JSON.parse, + + // Parse text as xml + "text xml": jQuery.parseXML + }, + + // For options that shouldn't be deep extended: + // you can add your own custom options here if + // and when you create one that shouldn't be + // deep extended (see ajaxExtend) + flatOptions: { + url: true, + context: true + } + }, + + // Creates a full fledged settings object into target + // with both ajaxSettings and settings fields. + // If target is omitted, writes into ajaxSettings. + ajaxSetup: function( target, settings ) { + return settings ? 
+ + // Building a settings object + ajaxExtend( ajaxExtend( target, jQuery.ajaxSettings ), settings ) : + + // Extending ajaxSettings + ajaxExtend( jQuery.ajaxSettings, target ); + }, + + ajaxPrefilter: addToPrefiltersOrTransports( prefilters ), + ajaxTransport: addToPrefiltersOrTransports( transports ), + + // Main method + ajax: function( url, options ) { + + // If url is an object, simulate pre-1.5 signature + if ( typeof url === "object" ) { + options = url; + url = undefined; + } + + // Force options to be an object + options = options || {}; + + var transport, + + // URL without anti-cache param + cacheURL, + + // Response headers + responseHeadersString, + responseHeaders, + + // timeout handle + timeoutTimer, + + // Url cleanup var + urlAnchor, + + // Request state (becomes false upon send and true upon completion) + completed, + + // To know if global events are to be dispatched + fireGlobals, + + // Loop variable + i, + + // uncached part of the url + uncached, + + // Create the final options object + s = jQuery.ajaxSetup( {}, options ), + + // Callbacks context + callbackContext = s.context || s, + + // Context for global events is callbackContext if it is a DOM node or jQuery collection + globalEventContext = s.context && + ( callbackContext.nodeType || callbackContext.jquery ) ? + jQuery( callbackContext ) : + jQuery.event, + + // Deferreds + deferred = jQuery.Deferred(), + completeDeferred = jQuery.Callbacks( "once memory" ), + + // Status-dependent callbacks + statusCode = s.statusCode || {}, + + // Headers (they are sent all at once) + requestHeaders = {}, + requestHeadersNames = {}, + + // Default abort message + strAbort = "canceled", + + // Fake xhr + jqXHR = { + readyState: 0, + + // Builds headers hashtable if needed + getResponseHeader: function( key ) { + var match; + if ( completed ) { + if ( !responseHeaders ) { + responseHeaders = {}; + while ( ( match = rheaders.exec( responseHeadersString ) ) ) { + responseHeaders[ match[ 1 ].toLowerCase() + " " ] = + ( responseHeaders[ match[ 1 ].toLowerCase() + " " ] || [] ) + .concat( match[ 2 ] ); + } + } + match = responseHeaders[ key.toLowerCase() + " " ]; + } + return match == null ? null : match.join( ", " ); + }, + + // Raw string + getAllResponseHeaders: function() { + return completed ? 
responseHeadersString : null; + }, + + // Caches the header + setRequestHeader: function( name, value ) { + if ( completed == null ) { + name = requestHeadersNames[ name.toLowerCase() ] = + requestHeadersNames[ name.toLowerCase() ] || name; + requestHeaders[ name ] = value; + } + return this; + }, + + // Overrides response content-type header + overrideMimeType: function( type ) { + if ( completed == null ) { + s.mimeType = type; + } + return this; + }, + + // Status-dependent callbacks + statusCode: function( map ) { + var code; + if ( map ) { + if ( completed ) { + + // Execute the appropriate callbacks + jqXHR.always( map[ jqXHR.status ] ); + } else { + + // Lazy-add the new callbacks in a way that preserves old ones + for ( code in map ) { + statusCode[ code ] = [ statusCode[ code ], map[ code ] ]; + } + } + } + return this; + }, + + // Cancel the request + abort: function( statusText ) { + var finalText = statusText || strAbort; + if ( transport ) { + transport.abort( finalText ); + } + done( 0, finalText ); + return this; + } + }; + + // Attach deferreds + deferred.promise( jqXHR ); + + // Add protocol if not provided (prefilters might expect it) + // Handle falsy url in the settings object (#10093: consistency with old signature) + // We also use the url parameter if available + s.url = ( ( url || s.url || location.href ) + "" ) + .replace( rprotocol, location.protocol + "//" ); + + // Alias method option to type as per ticket #12004 + s.type = options.method || options.type || s.method || s.type; + + // Extract dataTypes list + s.dataTypes = ( s.dataType || "*" ).toLowerCase().match( rnothtmlwhite ) || [ "" ]; + + // A cross-domain request is in order when the origin doesn't match the current origin. + if ( s.crossDomain == null ) { + urlAnchor = document.createElement( "a" ); + + // Support: IE <=8 - 11, Edge 12 - 15 + // IE throws exception on accessing the href property if url is malformed, + // e.g. 
http://example.com:80x/ + try { + urlAnchor.href = s.url; + + // Support: IE <=8 - 11 only + // Anchor's host property isn't correctly set when s.url is relative + urlAnchor.href = urlAnchor.href; + s.crossDomain = originAnchor.protocol + "//" + originAnchor.host !== + urlAnchor.protocol + "//" + urlAnchor.host; + } catch ( e ) { + + // If there is an error parsing the URL, assume it is crossDomain, + // it can be rejected by the transport if it is invalid + s.crossDomain = true; + } + } + + // Convert data if not already a string + if ( s.data && s.processData && typeof s.data !== "string" ) { + s.data = jQuery.param( s.data, s.traditional ); + } + + // Apply prefilters + inspectPrefiltersOrTransports( prefilters, s, options, jqXHR ); + + // If request was aborted inside a prefilter, stop there + if ( completed ) { + return jqXHR; + } + + // We can fire global events as of now if asked to + // Don't fire events if jQuery.event is undefined in an AMD-usage scenario (#15118) + fireGlobals = jQuery.event && s.global; + + // Watch for a new set of requests + if ( fireGlobals && jQuery.active++ === 0 ) { + jQuery.event.trigger( "ajaxStart" ); + } + + // Uppercase the type + s.type = s.type.toUpperCase(); + + // Determine if request has content + s.hasContent = !rnoContent.test( s.type ); + + // Save the URL in case we're toying with the If-Modified-Since + // and/or If-None-Match header later on + // Remove hash to simplify url manipulation + cacheURL = s.url.replace( rhash, "" ); + + // More options handling for requests with no content + if ( !s.hasContent ) { + + // Remember the hash so we can put it back + uncached = s.url.slice( cacheURL.length ); + + // If data is available and should be processed, append data to url + if ( s.data && ( s.processData || typeof s.data === "string" ) ) { + cacheURL += ( rquery.test( cacheURL ) ? "&" : "?" ) + s.data; + + // #9682: remove data so that it's not used in an eventual retry + delete s.data; + } + + // Add or update anti-cache param if needed + if ( s.cache === false ) { + cacheURL = cacheURL.replace( rantiCache, "$1" ); + uncached = ( rquery.test( cacheURL ) ? "&" : "?" ) + "_=" + ( nonce.guid++ ) + + uncached; + } + + // Put hash and anti-cache on the URL that will be requested (gh-1732) + s.url = cacheURL + uncached; + + // Change '%20' to '+' if this is encoded form body content (gh-2658) + } else if ( s.data && s.processData && + ( s.contentType || "" ).indexOf( "application/x-www-form-urlencoded" ) === 0 ) { + s.data = s.data.replace( r20, "+" ); + } + + // Set the If-Modified-Since and/or If-None-Match header, if in ifModified mode. + if ( s.ifModified ) { + if ( jQuery.lastModified[ cacheURL ] ) { + jqXHR.setRequestHeader( "If-Modified-Since", jQuery.lastModified[ cacheURL ] ); + } + if ( jQuery.etag[ cacheURL ] ) { + jqXHR.setRequestHeader( "If-None-Match", jQuery.etag[ cacheURL ] ); + } + } + + // Set the correct header, if data is being sent + if ( s.data && s.hasContent && s.contentType !== false || options.contentType ) { + jqXHR.setRequestHeader( "Content-Type", s.contentType ); + } + + // Set the Accepts header for the server, depending on the dataType + jqXHR.setRequestHeader( + "Accept", + s.dataTypes[ 0 ] && s.accepts[ s.dataTypes[ 0 ] ] ? + s.accepts[ s.dataTypes[ 0 ] ] + + ( s.dataTypes[ 0 ] !== "*" ? 
", " + allTypes + "; q=0.01" : "" ) : + s.accepts[ "*" ] + ); + + // Check for headers option + for ( i in s.headers ) { + jqXHR.setRequestHeader( i, s.headers[ i ] ); + } + + // Allow custom headers/mimetypes and early abort + if ( s.beforeSend && + ( s.beforeSend.call( callbackContext, jqXHR, s ) === false || completed ) ) { + + // Abort if not done already and return + return jqXHR.abort(); + } + + // Aborting is no longer a cancellation + strAbort = "abort"; + + // Install callbacks on deferreds + completeDeferred.add( s.complete ); + jqXHR.done( s.success ); + jqXHR.fail( s.error ); + + // Get transport + transport = inspectPrefiltersOrTransports( transports, s, options, jqXHR ); + + // If no transport, we auto-abort + if ( !transport ) { + done( -1, "No Transport" ); + } else { + jqXHR.readyState = 1; + + // Send global event + if ( fireGlobals ) { + globalEventContext.trigger( "ajaxSend", [ jqXHR, s ] ); + } + + // If request was aborted inside ajaxSend, stop there + if ( completed ) { + return jqXHR; + } + + // Timeout + if ( s.async && s.timeout > 0 ) { + timeoutTimer = window.setTimeout( function() { + jqXHR.abort( "timeout" ); + }, s.timeout ); + } + + try { + completed = false; + transport.send( requestHeaders, done ); + } catch ( e ) { + + // Rethrow post-completion exceptions + if ( completed ) { + throw e; + } + + // Propagate others as results + done( -1, e ); + } + } + + // Callback for when everything is done + function done( status, nativeStatusText, responses, headers ) { + var isSuccess, success, error, response, modified, + statusText = nativeStatusText; + + // Ignore repeat invocations + if ( completed ) { + return; + } + + completed = true; + + // Clear timeout if it exists + if ( timeoutTimer ) { + window.clearTimeout( timeoutTimer ); + } + + // Dereference transport for early garbage collection + // (no matter how long the jqXHR object will be used) + transport = undefined; + + // Cache response headers + responseHeadersString = headers || ""; + + // Set readyState + jqXHR.readyState = status > 0 ? 4 : 0; + + // Determine if successful + isSuccess = status >= 200 && status < 300 || status === 304; + + // Get response data + if ( responses ) { + response = ajaxHandleResponses( s, jqXHR, responses ); + } + + // Use a noop converter for missing script + if ( !isSuccess && jQuery.inArray( "script", s.dataTypes ) > -1 ) { + s.converters[ "text script" ] = function() {}; + } + + // Convert no matter what (that way responseXXX fields are always set) + response = ajaxConvert( s, response, jqXHR, isSuccess ); + + // If successful, handle type chaining + if ( isSuccess ) { + + // Set the If-Modified-Since and/or If-None-Match header, if in ifModified mode. 
+ if ( s.ifModified ) { + modified = jqXHR.getResponseHeader( "Last-Modified" ); + if ( modified ) { + jQuery.lastModified[ cacheURL ] = modified; + } + modified = jqXHR.getResponseHeader( "etag" ); + if ( modified ) { + jQuery.etag[ cacheURL ] = modified; + } + } + + // if no content + if ( status === 204 || s.type === "HEAD" ) { + statusText = "nocontent"; + + // if not modified + } else if ( status === 304 ) { + statusText = "notmodified"; + + // If we have data, let's convert it + } else { + statusText = response.state; + success = response.data; + error = response.error; + isSuccess = !error; + } + } else { + + // Extract error from statusText and normalize for non-aborts + error = statusText; + if ( status || !statusText ) { + statusText = "error"; + if ( status < 0 ) { + status = 0; + } + } + } + + // Set data for the fake xhr object + jqXHR.status = status; + jqXHR.statusText = ( nativeStatusText || statusText ) + ""; + + // Success/Error + if ( isSuccess ) { + deferred.resolveWith( callbackContext, [ success, statusText, jqXHR ] ); + } else { + deferred.rejectWith( callbackContext, [ jqXHR, statusText, error ] ); + } + + // Status-dependent callbacks + jqXHR.statusCode( statusCode ); + statusCode = undefined; + + if ( fireGlobals ) { + globalEventContext.trigger( isSuccess ? "ajaxSuccess" : "ajaxError", + [ jqXHR, s, isSuccess ? success : error ] ); + } + + // Complete + completeDeferred.fireWith( callbackContext, [ jqXHR, statusText ] ); + + if ( fireGlobals ) { + globalEventContext.trigger( "ajaxComplete", [ jqXHR, s ] ); + + // Handle the global AJAX counter + if ( !( --jQuery.active ) ) { + jQuery.event.trigger( "ajaxStop" ); + } + } + } + + return jqXHR; + }, + + getJSON: function( url, data, callback ) { + return jQuery.get( url, data, callback, "json" ); + }, + + getScript: function( url, callback ) { + return jQuery.get( url, undefined, callback, "script" ); + } +} ); + +jQuery.each( [ "get", "post" ], function( _i, method ) { + jQuery[ method ] = function( url, data, callback, type ) { + + // Shift arguments if data argument was omitted + if ( isFunction( data ) ) { + type = type || callback; + callback = data; + data = undefined; + } + + // The url can be an options object (which then must have .url) + return jQuery.ajax( jQuery.extend( { + url: url, + type: method, + dataType: type, + data: data, + success: callback + }, jQuery.isPlainObject( url ) && url ) ); + }; +} ); + +jQuery.ajaxPrefilter( function( s ) { + var i; + for ( i in s.headers ) { + if ( i.toLowerCase() === "content-type" ) { + s.contentType = s.headers[ i ] || ""; + } + } +} ); + + +jQuery._evalUrl = function( url, options, doc ) { + return jQuery.ajax( { + url: url, + + // Make this explicit, since user can override this through ajaxSetup (#11264) + type: "GET", + dataType: "script", + cache: true, + async: false, + global: false, + + // Only evaluate the response if it is successful (gh-4126) + // dataFilter is not invoked for failure responses, so using it instead + // of the default converter is kludgy but it works. 
+ converters: { + "text script": function() {} + }, + dataFilter: function( response ) { + jQuery.globalEval( response, options, doc ); + } + } ); +}; + + +jQuery.fn.extend( { + wrapAll: function( html ) { + var wrap; + + if ( this[ 0 ] ) { + if ( isFunction( html ) ) { + html = html.call( this[ 0 ] ); + } + + // The elements to wrap the target around + wrap = jQuery( html, this[ 0 ].ownerDocument ).eq( 0 ).clone( true ); + + if ( this[ 0 ].parentNode ) { + wrap.insertBefore( this[ 0 ] ); + } + + wrap.map( function() { + var elem = this; + + while ( elem.firstElementChild ) { + elem = elem.firstElementChild; + } + + return elem; + } ).append( this ); + } + + return this; + }, + + wrapInner: function( html ) { + if ( isFunction( html ) ) { + return this.each( function( i ) { + jQuery( this ).wrapInner( html.call( this, i ) ); + } ); + } + + return this.each( function() { + var self = jQuery( this ), + contents = self.contents(); + + if ( contents.length ) { + contents.wrapAll( html ); + + } else { + self.append( html ); + } + } ); + }, + + wrap: function( html ) { + var htmlIsFunction = isFunction( html ); + + return this.each( function( i ) { + jQuery( this ).wrapAll( htmlIsFunction ? html.call( this, i ) : html ); + } ); + }, + + unwrap: function( selector ) { + this.parent( selector ).not( "body" ).each( function() { + jQuery( this ).replaceWith( this.childNodes ); + } ); + return this; + } +} ); + + +jQuery.expr.pseudos.hidden = function( elem ) { + return !jQuery.expr.pseudos.visible( elem ); +}; +jQuery.expr.pseudos.visible = function( elem ) { + return !!( elem.offsetWidth || elem.offsetHeight || elem.getClientRects().length ); +}; + + + + +jQuery.ajaxSettings.xhr = function() { + try { + return new window.XMLHttpRequest(); + } catch ( e ) {} +}; + +var xhrSuccessStatus = { + + // File protocol always yields status code 0, assume 200 + 0: 200, + + // Support: IE <=9 only + // #1450: sometimes IE returns 1223 when it should be 204 + 1223: 204 + }, + xhrSupported = jQuery.ajaxSettings.xhr(); + +support.cors = !!xhrSupported && ( "withCredentials" in xhrSupported ); +support.ajax = xhrSupported = !!xhrSupported; + +jQuery.ajaxTransport( function( options ) { + var callback, errorCallback; + + // Cross domain only allowed if supported through XMLHttpRequest + if ( support.cors || xhrSupported && !options.crossDomain ) { + return { + send: function( headers, complete ) { + var i, + xhr = options.xhr(); + + xhr.open( + options.type, + options.url, + options.async, + options.username, + options.password + ); + + // Apply custom fields if provided + if ( options.xhrFields ) { + for ( i in options.xhrFields ) { + xhr[ i ] = options.xhrFields[ i ]; + } + } + + // Override mime type if needed + if ( options.mimeType && xhr.overrideMimeType ) { + xhr.overrideMimeType( options.mimeType ); + } + + // X-Requested-With header + // For cross-domain requests, seeing as conditions for a preflight are + // akin to a jigsaw puzzle, we simply never set it to be sure. + // (it can always be set on a per-request basis or even using ajaxSetup) + // For same-domain requests, won't change header if already provided. 
+ if ( !options.crossDomain && !headers[ "X-Requested-With" ] ) { + headers[ "X-Requested-With" ] = "XMLHttpRequest"; + } + + // Set headers + for ( i in headers ) { + xhr.setRequestHeader( i, headers[ i ] ); + } + + // Callback + callback = function( type ) { + return function() { + if ( callback ) { + callback = errorCallback = xhr.onload = + xhr.onerror = xhr.onabort = xhr.ontimeout = + xhr.onreadystatechange = null; + + if ( type === "abort" ) { + xhr.abort(); + } else if ( type === "error" ) { + + // Support: IE <=9 only + // On a manual native abort, IE9 throws + // errors on any property access that is not readyState + if ( typeof xhr.status !== "number" ) { + complete( 0, "error" ); + } else { + complete( + + // File: protocol always yields status 0; see #8605, #14207 + xhr.status, + xhr.statusText + ); + } + } else { + complete( + xhrSuccessStatus[ xhr.status ] || xhr.status, + xhr.statusText, + + // Support: IE <=9 only + // IE9 has no XHR2 but throws on binary (trac-11426) + // For XHR2 non-text, let the caller handle it (gh-2498) + ( xhr.responseType || "text" ) !== "text" || + typeof xhr.responseText !== "string" ? + { binary: xhr.response } : + { text: xhr.responseText }, + xhr.getAllResponseHeaders() + ); + } + } + }; + }; + + // Listen to events + xhr.onload = callback(); + errorCallback = xhr.onerror = xhr.ontimeout = callback( "error" ); + + // Support: IE 9 only + // Use onreadystatechange to replace onabort + // to handle uncaught aborts + if ( xhr.onabort !== undefined ) { + xhr.onabort = errorCallback; + } else { + xhr.onreadystatechange = function() { + + // Check readyState before timeout as it changes + if ( xhr.readyState === 4 ) { + + // Allow onerror to be called first, + // but that will not handle a native abort + // Also, save errorCallback to a variable + // as xhr.onerror cannot be accessed + window.setTimeout( function() { + if ( callback ) { + errorCallback(); + } + } ); + } + }; + } + + // Create the abort callback + callback = callback( "abort" ); + + try { + + // Do send the request (this may raise an exception) + xhr.send( options.hasContent && options.data || null ); + } catch ( e ) { + + // #14683: Only rethrow if this hasn't been notified as an error yet + if ( callback ) { + throw e; + } + } + }, + + abort: function() { + if ( callback ) { + callback(); + } + } + }; + } +} ); + + + + +// Prevent auto-execution of scripts when no explicit dataType was provided (See gh-2432) +jQuery.ajaxPrefilter( function( s ) { + if ( s.crossDomain ) { + s.contents.script = false; + } +} ); + +// Install script dataType +jQuery.ajaxSetup( { + accepts: { + script: "text/javascript, application/javascript, " + + "application/ecmascript, application/x-ecmascript" + }, + contents: { + script: /\b(?:java|ecma)script\b/ + }, + converters: { + "text script": function( text ) { + jQuery.globalEval( text ); + return text; + } + } +} ); + +// Handle cache's special case and crossDomain +jQuery.ajaxPrefilter( "script", function( s ) { + if ( s.cache === undefined ) { + s.cache = false; + } + if ( s.crossDomain ) { + s.type = "GET"; + } +} ); + +// Bind script tag hack transport +jQuery.ajaxTransport( "script", function( s ) { + + // This transport only deals with cross domain or forced-by-attrs requests + if ( s.crossDomain || s.scriptAttrs ) { + var script, callback; + return { + send: function( _, complete ) { + script = jQuery( " + + + + + + + + + \ No newline at end of file diff --git a/docs/0.7.1/html/add_new_model_to_data_labeler.ipynb 
b/docs/0.7.1/html/add_new_model_to_data_labeler.ipynb new file mode 100644 index 000000000..3f59297bc --- /dev/null +++ b/docs/0.7.1/html/add_new_model_to_data_labeler.ipynb @@ -0,0 +1,438 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "228bb2a6", + "metadata": {}, + "source": [ + "# Adding new model to the existing DataLabeler pipeline" + ] + }, + { + "cell_type": "markdown", + "id": "cab7a569", + "metadata": {}, + "source": [ + "Consider the case when we would like to explore different character-level neural network models and evaluate their performance on different datasets. The existing DataLabeler in the DataProfiler library already contains a preprocessor, a postprocessor, and a character-level CNN (Convolutional Neural Network) model that are combined to work on such data. All we need is to build additional model classes that inherit the main functionalities from the CNN model and also adapt the model construction to the desired architectures. In this example, we define such a new model to be used with the Data Labeler component of the Data Profiler. In particular, a character-level LSTM (Long Short-Term Memory) model is implemented, then integrated into the DataLabeler pipeline to be trained with a tabular dataset. The process includes the following steps:\n", + "\n", + " - Build a new character-level LSTM model that inherits the CNN model\n", + " - Load the DataLabeler from the DataProfiler\n", + " - Swap the existing CNN model with the new LSTM model\n", + " - Train the data labeler pipeline on a given dataset\n", + "\n", + "First, let's import the libraries needed for this example." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16624c48", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import json\n", + "import pandas as pd\n", + "sys.path.insert(0, '..')\n", + "import dataprofiler as dp" + ] + }, + { + "cell_type": "markdown", + "id": "e90728ab", + "metadata": {}, + "source": [ + "## Dataset" + ] + }, + { + "cell_type": "markdown", + "id": "3d61981c", + "metadata": {}, + "source": [ + "In this example, we use a structured dataset, the aws honeypot dataset, given in the test folder of the library. This dataset is first read by the Data Reader class of the Data Profiler, then split into training and test data to be used in the next sections." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f031fe06", + "metadata": {}, + "outputs": [], + "source": [ + "# use data reader to read input data\n", + "data = dp.Data(\"../dataprofiler/tests/data/csv/aws_honeypot_marx_geo.csv\")\n", + "df_data = data.data\n", + "\n", + "# split data to training and test set\n", + "split_ratio = 0.2\n", + "df_data = df_data.sample(frac=1).reset_index(drop=True)\n", + "data_train = df_data[:int((1 - split_ratio) * len(df_data))]\n", + "data_test = df_data[int((1 - split_ratio) * len(df_data)):]\n", + "\n", + "df_data.head()" + ] + }, + { + "cell_type": "markdown", + "id": "745ed0d4", + "metadata": {}, + "source": [ + "## Implement a new character-level LSTM model" + ] + }, + { + "cell_type": "markdown", + "id": "7375b0c0", + "metadata": {}, + "source": [ + "This new model is inherited from `CharacterLevelCnnModel` class, with some modifications on the following functions\n", + "\n", + "`__init__`: to add new parameters for the LSTM model. 
The new parameters, `size_lstm`, `rec_dropout`, `activation`, `recurrent_activation`, specify number of LSTM layers, activation function, and recurrent dropout ratio.\n", + "\n", + "`_validate_parameters`: to add additional checks on the new parameters for the LSTM model\n", + "\n", + "`_construct_model`: to construct the new LSTM model with the desired architecture" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8568fb49", + "metadata": {}, + "outputs": [], + "source": [ + "import tensorflow as tf\n", + "import numpy as np\n", + "from dataprofiler.labelers.character_level_cnn_model import CharacterLevelCnnModel, F1Score, \\\n", + " create_glove_char, build_embd_dictionary\n", + "from dataprofiler.labelers.base_model import BaseModel\n", + "\n", + "# CharacterLevelLstmModel derives from CharacterLevelCnnModel\n", + "#########################################################\n", + "#########################################################\n", + "class CharacterLevelLstmModel(CharacterLevelCnnModel):\n", + "\n", + " # boolean if the label mapping requires the mapping for index 0 reserved\n", + " requires_zero_mapping = True\n", + "\n", + " def __init__(self, label_mapping=None, parameters=None):\n", + " \"\"\"\n", + " LSTM Model Initializer\n", + " \"\"\"\n", + " \n", + " # parameter initialization\n", + " if not parameters:\n", + " parameters = {}\n", + " parameters.setdefault('max_length', 3400)\n", + " parameters.setdefault('max_char_encoding_id', 127)\n", + " parameters.setdefault('dim_embed', 64)\n", + " parameters.setdefault('size_fc', [32, 32])\n", + " parameters.setdefault('dropout', 0.1)\n", + " # new parameters for LSTM model\n", + " #########################################################\n", + " #########################################################\n", + " parameters.setdefault('size_lstm', [64])\n", + " parameters.setdefault('rec_dropout', 0.1)\n", + " parameters.setdefault('activation', \"tanh\")\n", + " parameters.setdefault('recurrent_activation', \"sigmoid\")\n", + " #########################################################\n", + " #########################################################\n", + " parameters.setdefault('default_label', \"UNKNOWN\")\n", + " parameters['pad_label'] = 'PAD'\n", + " self._epoch_id = 0\n", + "\n", + " # reconstruct flags for model\n", + " self._model_num_labels = 0\n", + " self._model_default_ind = -1\n", + "\n", + " BaseModel.__init__(self, label_mapping, parameters)\n", + "\n", + " def _validate_parameters(self, parameters):\n", + " \"\"\"\n", + " Validate the parameters sent in. 
Raise error if invalid parameters are\n", + " present.\n", + " \"\"\"\n", + " errors = []\n", + " list_of_necessary_params = ['max_length', 'max_char_encoding_id',\n", + " 'dim_embed', 'size_fc', 'dropout',\n", + " 'size_lstm', 'rec_dropout', 'activation', \n", + " 'recurrent_activation', 'default_label', \n", + " 'pad_label']\n", + " # Make sure the necessary parameters are present and valid.\n", + " for param in parameters:\n", + " if param in ['max_length', 'max_char_encoding_id', 'dim_embed',\n", + " 'size_conv']:\n", + " if not isinstance(parameters[param], (int, float)) \\\n", + " or parameters[param] < 0:\n", + " errors.append(param + \" must be a valid integer or float \"\n", + " \"greater than 0.\")\n", + " elif param in ['dropout', 'rec_dropout']: # additional check for rec_dropout\n", + " if not isinstance(parameters[param], (int, float)) \\\n", + " or parameters[param] < 0 or parameters[param] > 1:\n", + " errors.append(param + \" must be a valid integer or float \"\n", + " \"from 0 to 1.\")\n", + " elif param == 'size_fc' or param == 'size_lstm': # additional check for size_lstm\n", + " if not isinstance(parameters[param], list) \\\n", + " or len(parameters[param]) == 0:\n", + " errors.append(param + \" must be a non-empty list of \"\n", + " \"integers.\")\n", + " else:\n", + " for item in parameters[param]:\n", + " if not isinstance(item, int):\n", + " errors.append(param + \" must be a non-empty \"\n", + " \"list of integers.\")\n", + " break\n", + " elif param in ['default_label', 'activation', 'recurrent_activation']: # additional check for activation and recurrent_activation\n", + " if not isinstance(parameters[param], str):\n", + " error = str(param) + \" must be a string.\"\n", + " errors.append(error)\n", + "\n", + " # Error if there are extra parameters thrown in\n", + " for param in parameters:\n", + " if param not in list_of_necessary_params:\n", + " errors.append(param + \" is not an accepted parameter.\")\n", + " if errors:\n", + " raise ValueError('\\n'.join(errors))\n", + "\n", + " def _construct_model(self):\n", + " \"\"\"\n", + " Model constructor for the data labeler. 
This also serves as a weight\n", + " reset.\n", + "\n", + " :return: None\n", + " \"\"\"\n", + " num_labels = self.num_labels\n", + " default_ind = self.label_mapping[self._parameters['default_label']]\n", + "\n", + " # Reset model\n", + " tf.keras.backend.clear_session()\n", + "\n", + " # generate glove embedding\n", + " create_glove_char(self._parameters['dim_embed'])\n", + "\n", + " # generate model\n", + " self._model = tf.keras.models.Sequential()\n", + "\n", + " # default parameters\n", + " max_length = self._parameters['max_length']\n", + " max_char_encoding_id = self._parameters['max_char_encoding_id']\n", + "\n", + " # Encoding layer\n", + " def encoding_function(input_str):\n", + " char_in_vector = CharacterLevelLstmModel._char_encoding_layer(\n", + " input_str, max_char_encoding_id, max_length)\n", + " return char_in_vector\n", + "\n", + " self._model.add(tf.keras.layers.Input(shape=(None,), dtype=tf.string))\n", + "\n", + " self._model.add(\n", + " tf.keras.layers.Lambda(encoding_function,\n", + " output_shape=tuple([max_length])))\n", + "\n", + " # Create a pre-trained weight matrix\n", + " # character encoding indices range from 0 to max_char_encoding_id,\n", + " # we add one extra index for out-of-vocabulary character\n", + " embed_file = os.path.join(\n", + " \"../dataprofiler/labelers\", \"embeddings/glove-reduced-{}D.txt\".format(\n", + " self._parameters['dim_embed']))\n", + " embedding_matrix = np.zeros((max_char_encoding_id + 2,\n", + " self._parameters['dim_embed']))\n", + " embedding_dict = build_embd_dictionary(embed_file)\n", + "\n", + " input_shape = tuple([max_length])\n", + " # Fill in the weight matrix: let pad and space be 0s\n", + " for ascii_num in range(max_char_encoding_id):\n", + " if chr(ascii_num) in embedding_dict:\n", + " embedding_matrix[ascii_num + 1] = embedding_dict[chr(ascii_num)]\n", + "\n", + " self._model.add(tf.keras.layers.Embedding(\n", + " max_char_encoding_id + 2,\n", + " self._parameters['dim_embed'],\n", + " weights=[embedding_matrix],\n", + " input_length=input_shape[0],\n", + " trainable=True))\n", + " \n", + " # Add the lstm layers\n", + " #########################################################\n", + " #########################################################\n", + " for size in self._parameters['size_lstm']:\n", + " self._model.add(\n", + " tf.keras.layers.LSTM(units=size, \n", + " recurrent_dropout=self._parameters['rec_dropout'], \n", + " activation=self._parameters['activation'],\n", + " recurrent_activation=self._parameters['recurrent_activation'],\n", + " return_sequences=True))\n", + " if self._parameters['dropout']:\n", + " self._model.add(tf.keras.layers.Dropout(self._parameters['dropout']))\n", + " #########################################################\n", + " #########################################################\n", + "\n", + " # Add the fully connected layers\n", + " for size in self._parameters['size_fc']:\n", + " self._model.add(\n", + " tf.keras.layers.Dense(units=size, activation='relu'))\n", + " if self._parameters['dropout']:\n", + " self._model.add(\n", + " tf.keras.layers.Dropout(self._parameters['dropout']))\n", + "\n", + " # Add the final Softmax layer\n", + " self._model.add(\n", + " tf.keras.layers.Dense(num_labels, activation='softmax'))\n", + "\n", + " # Output the model into a .pb file for TensorFlow\n", + " argmax_layer = tf.keras.backend.argmax(self._model.output)\n", + "\n", + " # Create confidence layers\n", + " final_predicted_layer = CharacterLevelLstmModel._argmax_threshold_layer(\n", + " 
num_labels, threshold=0.0, default_ind=default_ind)\n", + "\n", + " argmax_outputs = self._model.outputs + \\\n", + " [argmax_layer,\n", + " final_predicted_layer(argmax_layer, self._model.output)]\n", + " self._model = tf.keras.Model(self._model.inputs, argmax_outputs)\n", + "\n", + " # Compile the model\n", + " softmax_output_layer_name = self._model.outputs[0].name.split('/')[0]\n", + " losses = {softmax_output_layer_name: \"categorical_crossentropy\"}\n", + "\n", + " # use f1 score metric\n", + " f1_score_training = F1Score(num_classes=num_labels, average='micro')\n", + " metrics = {softmax_output_layer_name: ['acc', f1_score_training]}\n", + "\n", + " self._model.compile(loss=losses,\n", + " optimizer=\"adam\",\n", + " metrics=metrics)\n", + "\n", + " self._epoch_id = 0\n", + " self._model_num_labels = num_labels\n", + " self._model_default_ind = default_ind\n" + ] + }, + { + "cell_type": "markdown", + "id": "d66bd25c", + "metadata": {}, + "source": [ + "## Integrate the new LSTM model to the DataLabeler" + ] + }, + { + "cell_type": "markdown", + "id": "479f407a", + "metadata": {}, + "source": [ + "Once the LSTM model is built, it replaces the existing model in the DataLabeler pipeline, which is then trained on the given dataset. Note that, as the DataLabeler is trained on the above tabular dataset, its label mapping is updated by the list of column names in that dataset while training." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fb482ffe", + "metadata": {}, + "outputs": [], + "source": [ + "# get labels from the given dataset\n", + "value_label_df = data_train.reset_index(drop=True).melt()\n", + "value_label_df.columns = [1, 0] # labels=1, values=0 in that order\n", + "value_label_df = value_label_df.astype(str)\n", + "labels = value_label_df[1].unique().tolist()\n", + "\n", + "# create a new LSTM model\n", + "# set default label (one of the column names) to the model\n", + "model = CharacterLevelLstmModel(label_mapping=labels, parameters={'default_label': 'comment'})\n", + "\n", + "# add the new LSTM model to the data labeler\n", + "data_labeler = dp.DataLabeler(labeler_type='structured', trainable=True)\n", + "data_labeler.set_model(model)\n", + "\n", + "# set default label (one of the column names) to the preprocessor and postprocessor\n", + "processor_params = {'default_label': 'comment'}\n", + "data_labeler._preprocessor.set_params(**processor_params)\n", + "data_labeler._postprocessor.set_params(**processor_params)\n", + "\n", + "# train the data labeler\n", + "save_dirpath=\"data_labeler_saved\"\n", + "if not os.path.exists(save_dirpath):\n", + " os.makedirs(save_dirpath)\n", + "\n", + "epochs=2\n", + "data_labeler.fit(\n", + " x=value_label_df[0], y=value_label_df[1], labels=labels, epochs=epochs)\n", + "if save_dirpath:\n", + " data_labeler.save_to_disk(save_dirpath)" + ] + }, + { + "cell_type": "markdown", + "id": "14b78c69", + "metadata": {}, + "source": [ + "The trained Data Labeler is then used by the Data Profiler to provide the prediction on the new dataset. In this example, all options except data labeler are disabled for the sake of presenting data labeler functionality. The results are given in the columnar format where true column types are given in the first column, and the predicted column labels are given in the second column." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bdfcf1d2", + "metadata": {}, + "outputs": [], + "source": [ + "# predict with the data labeler object\n", + "profile_options = dp.ProfilerOptions()\n", + "profile_options.set({\"structured_options.text.is_enabled\": False, \n", + " \"int.is_enabled\": False, \n", + " \"float.is_enabled\": False, \n", + " \"order.is_enabled\": False, \n", + " \"category.is_enabled\": False, \n", + " \"datetime.is_enabled\": False,})\n", + "profile_options.set({'structured_options.data_labeler.data_labeler_object': data_labeler})\n", + "profile = dp.Profiler(data_test, options=profile_options)\n", + "\n", + "# get the prediction from the data profiler\n", + "def get_structured_results(results):\n", + " columns = []\n", + " predictions = []\n", + " for col_report in results['data_stats']:\n", + " columns.append(col_report['column_name'])\n", + " predictions.append(col_report['data_label'])\n", + "\n", + " df_results = pd.DataFrame({'Column': columns, 'Prediction': predictions})\n", + " return df_results\n", + "\n", + "results = profile.report()\n", + "print(get_structured_results(results))" + ] + }, + { + "cell_type": "markdown", + "id": "cc60ff8a", + "metadata": {}, + "source": [ + "In summary, users can define their own model, plug it in the DataLabeler pipeline, and train the labeler with the new dataset. Above, we show one example of adding the LSTM model to the pipeline. Interested users can implement other neural network models as desired with the same process." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/0.7.1/html/data_labeling.html b/docs/0.7.1/html/data_labeling.html new file mode 100644 index 000000000..e2754274b --- /dev/null +++ b/docs/0.7.1/html/data_labeling.html @@ -0,0 +1,604 @@ + + + + + + + + + Labeler (Sensitive Data) - Data Profiler v0.7.1 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
Labeler (Sensitive Data)

+

In this library, the term data labeling refers to entity recognition.

+

Built into the data profiler is a classifier which evaluates the complex data types of the dataset. For structured data, it determines the complex data type of each column. When running the data profile, it uses the default data labeling model built into the library. However, the data labeler also allows users to train their own labeler.
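For example, once a profile has been generated, the predicted label for each column can be read out of the report. A minimal sketch (the data_stats/data_label report keys follow the notebook example shipped with these docs; the CSV path is a placeholder):

import dataprofiler as dp

# profile a structured dataset; the default data labeler runs during profiling
data = dp.Data("your_data.csv")
profile = dp.Profiler(data)

# each column's predicted label is reported under data_stats -> data_label
report = profile.report()
for col_stats in report['data_stats']:
    print(col_stats['column_name'], col_stats['data_label'])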

+

Data Labels are determined per cell for structured data (column/row when +the profiler is used) or at the character level for unstructured data. This +is a list of the default labels.

+
    +
  • UNKNOWN
  • ADDRESS
  • BAN (bank account number, 10-18 digits)
  • CREDIT_CARD
  • EMAIL_ADDRESS
  • UUID
  • HASH_OR_KEY (md5, sha1, sha256, random hash, etc.)
  • IPV4
  • IPV6
  • MAC_ADDRESS
  • PERSON
  • PHONE_NUMBER
  • SSN
  • URL
  • US_STATE
  • DRIVERS_LICENSE
  • DATE
  • TIME
  • DATETIME
  • INTEGER
  • FLOAT
  • QUANTITY
  • ORDINAL
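To see the label set of the labeler you actually loaded (rather than this static list), you can inspect the labeler object. A minimal sketch, assuming the loaded labeler exposes its label-to-index mapping via a label_mapping attribute (the help() call also appears in the loading example further down this page):

import dataprofiler as dp

# load the default structured data labeler shipped with the library
data_labeler = dp.DataLabeler(labeler_type='structured')

# print a summary of the labeler's parameters, inputs, and outputs
data_labeler.help()

# assumption: the label-to-index mapping is exposed directly on the labeler
print(data_labeler.label_mapping)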
+
+

Identify Entities in Structured Data

+

Make predictions and identify labels:

+
import dataprofiler as dp
+
+# load data and data labeler
+data = dp.Data("your_data.csv")
+data_labeler = dp.DataLabeler(labeler_type='structured')
+
+# make predictions and get labels per cell
+predictions = data_labeler.predict(data)
+
+
+
+
+

Identify Entities in Unstructured Data

+

Predict which class each character belongs to in unstructured text:

+
import dataprofiler as dp
+
+data_labeler = dp.DataLabeler(labeler_type='unstructured')
+
+# Example sample string, must be in an array (multiple arrays can be passed)
+sample = ["Help\tJohn Macklemore\tneeds\tfood.\tPlease\tCall\t555-301-1234."
+          "\tHis\tssn\tis\tnot\t334-97-1234. I'm a BAN: 000043219499392912.\n"]
+
+# Predict which class each character belongs to
+model_predictions = data_labeler.predict(
+    sample, predict_options=dict(show_confidences=True))
+
+# Predictions / confidences are at the character level
+final_results = model_predictions["pred"]
+final_confidences = model_predictions["conf"]
+
+
+

It's also possible to change the output format, for example to one similar to the SpaCy NER format:

+
import dataprofiler as dp
+
+data_labeler = dp.DataLabeler(labeler_type='unstructured', trainable=True)
+
+# Example sample string, must be in an array (multiple arrays can be passed)
+sample = ["Help\tJohn Macklemore\tneeds\tfood.\tPlease\tCall\t555-301-1234."
+          "\tHis\tssn\tis\tnot\t334-97-1234. I'm a BAN: 000043219499392912.\n"]
+
+# Set the output to the NER format (start position, end position, label)
+data_labeler.set_params(
+    { 'postprocessor': { 'output_format':'ner', 'use_word_level_argmax':True } }
+)
+
+results = data_labeler.predict(sample)
+
+print(results)
+
+
+
+
+

Train a New Data Labeler

+

Mechanism for training your own data labeler on your own set of structured (tabular) data:

+
import dataprofiler as dp
+
+# Will need one column with a default label of UNKNOWN
+data = dp.Data("your_file.csv")
+
+data_labeler = dp.train_structured_labeler(
+    data=data,
+    save_dirpath="/path/to/save/labeler",
+    epochs=2
+)
+
+data_labeler.save_to_disk("my/save/path") # Saves the data labeler for reuse
+
+
+
+
+

Load an Existing Data Labeler

+

Mechanism for loading an existing data_labeler:

+
import dataprofiler as dp
+
+data_labeler = dp.DataLabeler(
+    labeler_type='structured', dirpath="/path/to/my/labeler")
+
+# get information about the parameters/inputs/output formats for the DataLabeler
+data_labeler.help()
+
+
+
+
+

Extending a Data Labeler with Transfer Learning

+

Extending or changing the labels of a data labeler with transfer learning. Note: by default, a loaded labeler will not be trainable. In order to load a trainable DataLabeler, the user must set trainable=True or load a labeler using the TrainableDataLabeler class.

+

The following illustrates how to change the labels:

+
import dataprofiler as dp
+
+labels = ['label1', 'label2', ...]  # new label set can also be an encoding dict
+data = dp.Data("your_file.csv")  # contains data with new labels
+
+# load default structured Data Labeler w/ trainable set to True
+data_labeler = dp.DataLabeler(labeler_type='structured', trainable=True)
+
+# this will use transfer learning to retrain the data labeler on your new
+# dataset and labels.
+# NOTE: data must be in an acceptable format for the preprocessor to interpret.
+#       please refer to the preprocessor/model for the expected data format.
+#       Currently, the DataLabeler cannot take in Tabular data, but requires
+#       data to be ingested with two columns [X, y] where X is the samples and
+#       y is the labels.
+model_results = data_labeler.fit(x=data['samples'], y=data['labels'],
+                                 validation_split=0.2, epochs=2, labels=labels)
+
+# final_results, final_confidences are a list of results for each epoch
+epoch_id = 0
+final_results = model_results[epoch_id]["pred"]
+final_confidences = model_results[epoch_id]["conf"]
+
+
+

The following illustrates how to extend the labels:

+
import dataprofiler as dp
+
+new_labels = ['label1', 'label2', ...]
+data = dp.Data("your_file.csv")  # contains data with new labels
+
+# load default structured Data Labeler w/ trainable set to True
+data_labeler = dp.DataLabeler(labeler_type='structured', trainable=True)
+
+# this will maintain current labels and model weights, but extend the model's
+# labels
+for label in new_labels:
+    data_labeler.add_label(label)
+
+# NOTE: a user can also add a label which maps to the same index as an existing
+# label
+# data_labeler.add_label(label, same_as='<label_name>')
+
+# For a trainable model, the user must then train the model to be able to
+# continue using the labeler since the model's graph has likely changed
+# NOTE: data must be in an acceptable format for the preprocessor to interpret.
+#       please refer to the preprocessor/model for the expected data format.
+#       Currently, the DataLabeler cannot take in Tabular data, but requires
+#       data to be ingested with two columns [X, y] where X is the samples and
+#       y is the labels.
+model_results = data_labeler.fit(x=data['samples'], y=data['labels'],
+                                 validation_split=0.2, epochs=2)
+
+# final_results, final_confidences are a list of results for each epoch
+epoch_id = 0
+final_results = model_results[epoch_id]["pred"]
+final_confidences = model_results[epoch_id]["conf"]
+
+
+

Changing pipeline parameters:

+
import dataprofiler as dp
+
+# load default Data Labeler
+data_labeler = dp.DataLabeler(labeler_type='structured')
+
+# change parameters of specific component
+data_labeler.preprocessor.set_params({'param1': 'value1'})
+
+# change multiple simultaneously.
+data_labeler.set_params({
+    'preprocessor':  {'param1': 'value1'},
+    'model':         {'param2': 'value2'},
+    'postprocessor': {'param3': 'value3'}
+})
+
+
+
+

Build Your Own Data Labeler

+

The DataLabeler has 3 main components: preprocessor, model, and postprocessor. To create your own DataLabeler, each component must either be created from scratch or reused from an existing labeler.

+

Given a set of the 3 components, you can construct your own DataLabeler:

+

The following illustrates how to swap out specific components of an existing labeler:

import dataprofiler as dp
from dataprofiler.labelers.character_level_cnn_model import \
    CharacterLevelCnnModel
from dataprofiler.labelers.data_processing import \
    StructCharPreprocessor, StructCharPostprocessor

model = CharacterLevelCnnModel(...)
preprocessor = StructCharPreprocessor(...)
postprocessor = StructCharPostprocessor(...)

data_labeler = dp.DataLabeler(labeler_type='structured')
data_labeler.set_preprocessor(preprocessor)
data_labeler.set_model(model)
data_labeler.set_postprocessor(postprocessor)

# check for basic compatibility between the processors and the model
data_labeler.check_pipeline()

Model Component

To create your own model component for data labeling, you can utilize the BaseModel class from dataprofiler.labelers.base_model and override the abstract class methods.

Reviewing CharacterLevelCnnModel from dataprofiler.labelers.character_level_cnn_model illustrates the functions which need an override; a minimal skeleton is sketched after the list below.

  1. __init__: specifying default parameters and calling base __init__

  2. _validate_parameters: validating parameters given by user during setting

  3. _need_to_reconstruct_model: flag for when to reconstruct a model (i.e. parameters change or labels change require a model reconstruction)

  4. _construct_model: initial construction of the model given the parameters

  5. _reconstruct_model: updates model architecture for new label set while maintaining current model weights

  6. fit: mechanism for the model to learn given training data

  7. predict: mechanism for model to make predictions on data

  8. details: prints a summary of the model construction

  9. save_to_disk: saves model and model parameters to disk

  10. load_from_disk: loads model given a path on disk
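Putting these together, a minimal skeleton of a custom model might look like the following. This is only an illustrative sketch: the method signatures are assumptions rather than the authoritative interface, which is defined by BaseModel and demonstrated by CharacterLevelCnnModel.

from dataprofiler.labelers.base_model import BaseModel


class MyModel(BaseModel):
    """Sketch of a custom model component; signatures are assumed, not authoritative."""

    def __init__(self, label_mapping=None, parameters=None):
        # specify default parameters and call the base __init__
        parameters = parameters if parameters is not None else {}
        super().__init__(label_mapping, parameters)

    def _validate_parameters(self, parameters):
        # raise a descriptive error when a user-supplied parameter is invalid
        ...

    def _need_to_reconstruct_model(self):
        # return True when a parameter or label change requires rebuilding the graph
        ...

    def _construct_model(self):
        # initial construction of the model from the stored parameters
        ...

    def _reconstruct_model(self):
        # update the architecture for a new label set while keeping current weights
        ...

    def fit(self, train_data, val_data=None, epochs=1, **kwargs):
        # train the model and return per-epoch results
        ...

    def predict(self, data, **kwargs):
        # return predictions (and optionally confidences) for the given data
        ...

    def details(self):
        # print a summary of the model construction
        ...

    def save_to_disk(self, dirpath):
        # save the model and its parameters to disk
        ...

    @classmethod
    def load_from_disk(cls, dirpath):
        # load a previously saved model from the given path
        ...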

Preprocessor Component

To create your own preprocessor component for data labeling, you can utilize the BaseDataPreprocessor class from dataprofiler.labelers.data_processing and override the abstract class methods.

Reviewing StructCharPreprocessor from dataprofiler.labelers.data_processing illustrates the functions which need an override; a short sketch follows the list below.

  1. __init__: passing parameters to the base class and executing any extraneous calculations to be saved as parameters

  2. _validate_parameters: validating parameters given by user during setting

  3. process: takes in the user data and converts it into a digestible, iterable format for the model

  4. set_params (optional): if a parameter requires processing before setting, a user can override this function to assist with setting the parameter

  5. _save_processor (optional): if a parameter is not JSON serializable, a user can override this function to assist in saving the processor and its parameters

  6. load_from_disk (optional): if a parameter(s) is not JSON serializable, a user can override this function to assist in loading the processor
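A brief skeleton of a custom preprocessor might look like the following. The signatures here are assumptions; the authoritative interface is BaseDataPreprocessor and StructCharPreprocessor in dataprofiler.labelers.data_processing.

from dataprofiler.labelers.data_processing import BaseDataPreprocessor


class MyPreprocessor(BaseDataPreprocessor):
    """Sketch of a custom preprocessor; signatures are assumed, not authoritative."""

    def __init__(self, **parameters):
        # pass parameters to the base class; any derived values can be
        # computed here and stored as parameters as well
        super().__init__(**parameters)

    def _validate_parameters(self, parameters):
        # raise a descriptive error for any invalid user-supplied parameter
        ...

    def process(self, data, labels=None, label_mapping=None, batch_size=32):
        # convert the user's data into a digestible, iterable format for the model
        ...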

Postprocessor Component

The postprocessor is nearly identical to the preprocessor, except that it handles the output of the model. To create your own postprocessor component for data labeling, you can utilize the BaseDataPostprocessor class from dataprofiler.labelers.data_processing and override the abstract class methods.

Reviewing StructCharPostprocessor from dataprofiler.labelers.data_processing illustrates the functions which need an override; a short sketch follows the list below.

  1. __init__: passing parameters to the base class and executing any extraneous calculations to be saved as parameters

  2. _validate_parameters: validating parameters given by user during setting

  3. process: takes in the output of the model and processes it for output to the user

  4. set_params (optional): if a parameter requires processing before setting, a user can override this function to assist with setting the parameter

  5. _save_processor (optional): if a parameter is not JSON serializable, a user can override this function to assist in saving the processor and its parameters

  6. load_from_disk (optional): if a parameter(s) is not JSON serializable, a user can override this function to assist in loading the processor
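As with the preprocessor, a brief sketch (signatures assumed, not authoritative) differs mainly in what process receives and returns:

from dataprofiler.labelers.data_processing import BaseDataPostprocessor


class MyPostprocessor(BaseDataPostprocessor):
    """Sketch of a custom postprocessor; signatures are assumed, not authoritative."""

    def __init__(self, **parameters):
        # pass parameters to the base class
        super().__init__(**parameters)

    def _validate_parameters(self, parameters):
        # raise a descriptive error for any invalid user-supplied parameter
        ...

    def process(self, data, results, label_mapping):
        # convert raw model output into the final format returned to the user
        ...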
diff --git a/docs/0.7.1/html/data_reader.html b/docs/0.7.1/html/data_reader.html
new file mode 100644

View this notebook on GitHub

+
+

Intro to Data Readers

+

Within the Data Profiler, there are 5 data reader classes:

+
  • CSVData (delimited data: CSV, TSV, etc.)

  • JSONData

  • ParquetData

  • AVROData

  • TextData

Each of these classes can be used to read data individually; however, the Data Profiler provides the unique capability of auto-detecting what data you have and reading it automatically by using the Data class.

import dataprofiler as dp
data = dp.Data('/path/to/mydata.abc')  # auto detects and reads your data

Automatically reading and detecting data

+

Below is a demonstration of utilizing the Data class which automatically detects the type of data for a given file and reads it automatically.

+
+
[ ]:
+
+
+
+import os
+import sys
+
+try:
+    sys.path.insert(0, '..')
+    import dataprofiler as dp
+except ImportError:
+    import dataprofiler as dp
+
+
+
+
+
[ ]:
+
+
+
+# use data reader to read input data with different file types
+data_folder = "../dataprofiler/tests/data"
+csv_files = [
+    "csv/aws_honeypot_marx_geo.csv",
+    "csv/all-strings-skip-header-author.csv", # csv files with the author/description on the first line
+    "csv/sparse-first-and-last-column-empty-first-row.txt", # csv file with the .txt extension
+]
+json_files = [
+    "json/complex_nested.json",
+    "json/honeypot_intentially_mislabeled_file.csv", # json file with the .csv extension
+]
+parquet_files = [
+    "parquet/nation.dict.parquet",
+    "parquet/nation.plain.intentionally_mislabled_file.csv", # parquet file with the .csv extension
+]
+avro_files = [
+    "avro/userdata1.avro",
+    "avro/userdata1_intentionally_mislabled_file.json", # avro file with the .json extension
+]
+text_files = [
+    "txt/discussion_reddit.txt",
+]
+all_files = csv_files + json_files + parquet_files + avro_files + text_files
+print('filepath' + ' ' * 58 + 'data type')
+print('='*80)
+for file in all_files:
+    filepath = os.path.join(data_folder, file)
+    data = dp.Data(filepath)
+    print("{:<65} {:<15}".format(file, data.data_type))
+print("\n")
+
+
+
+
+
+

Specifying detection options of Data and loading pandas.DataFrame

+

The Data class also gives the user the ability to set options if they want to load their data with specific requirements. Options for each data reader are specified in the docs: https://capitalone.github.io/DataProfiler/docs/0.4.4/html/dataprofiler.data_readers.html

+
import dataprofiler as dp
+
+options = {...}  # allowed options are specified for each data reader.
+data = dp.Data(data, options=options)
+
+
+

Later in this tutorial, the options for the CSVData class will be discussed.

+

Additionally, a user can directly load a pandas.DataFrame as any data reader they choose.

+
+
[ ]:
+
+
+
+import pandas as pd
+from dataprofiler.data_readers.csv_data import CSVData
+
+
+df = pd.DataFrame(['my', 'random', 'data'])
+
+# specify via the `Data` class
+data = dp.Data(data=df, data_type='csv')
+print('Data Type: ', data.data_type)
+
+# specifically use the CSVData class
+data = CSVData(data=df)
+print('Data Type: ', data.data_type)
+
+
+
+
+
+

Accessing data and attributes

+

Once loaded, the data can be accessed via the data property of the object. Additional information about the data loaded may differ between data readers.

+

For this example we will focus on CSVData.

+
+
[ ]:
+
+
+
+filepath = "../dataprofiler/tests/data/csv/aws_honeypot_marx_geo.csv"
+data = dp.Data(filepath)
+print('Data Type: ', data.data_type)
+print('Data Filepath: ', data.input_file_path)
+print('File Encoding: ', data.file_encoding)
+print('Data Length (two techniques): ', len(data), data.length)
+print("Data Access:")
+data.data
+
+
+
+
+
+

Checking data file types with is_match

+

Each data reader has a class method is_match which determines whether or not a dataset is of a given data type.

+
CSVData.is_match
+JSONData.is_match
+ParquetData.is_match
+AVROData.is_match
+TextData.is_match
+
+
+
+
[ ]:
+
+
+
+# supplemental function
+def add_true_false_color(value):
+    """Converts True to green and False to red in printed text."""
+    if value:
+        return "\x1b[92m  " + str(value) + "\x1b[0m"
+    return "\x1b[31m " + str(value) + "\x1b[0m"
+
+
+
+
+
[ ]:
+
+
+
+from dataprofiler.data_readers.csv_data import CSVData
+
+
+non_csv_files = [
+    'json/iris-utf-8.json',
+    'json/honeypot_intentially_mislabeled_file.csv',
+    'parquet/titanic.parq',
+    'parquet/nation.plain.intentionally_mislabled_file.csv',
+    'txt/code.txt',
+    'txt/sentence.txt',
+    'avro/users.avro',
+    'avro/snappy_compressed_intentionally_mislabeled_file.csv',
+]
+
+print("Is the file a CSV?")
+print('=' * 80)
+for file in csv_files:
+    filepath = os.path.join(data_folder, file)
+    is_match = CSVData.is_match(filepath)
+    print(add_true_false_color(is_match), ':', file)
+    print('=' * 80)
+
+for file in non_csv_files:
+    filepath = os.path.join(data_folder, file)
+    is_match = CSVData.is_match(filepath)
+    print(add_true_false_color(is_match), ':', file)
+    print('=' * 80)
+
+
+
+
+
+

Reloading data after altering options with reload

+

There are two cases for using the reload function, both of which require the data type to have been interpreted correctly:

+
  1. The options were not correctly determined.
  2. The options were loaded correctly, but a change is desired.

In the example below, the data_format for reading the data is changed and the data is then reloaded.

+
+
[ ]:
+
+
+
+filepath = "../dataprofiler/tests/data/csv/diamonds.csv"
+
+data = dp.Data(filepath)
+print('original data:')
+print('=' * 80)
+print(data.data[:5])
+
+print()
+data.reload(options={'data_format': 'records', 'record_samples_per_line': 1})
+print('reloaded data:')
+print('=' * 80)
+data.data[:5]
+
+
+
+
+
+

A deeper dive into CSVData

+

The rest of this tutorial will focus on how to use the data reader class: CSVData. The CSVData class is used for reading delimited data. Delimited data are datasets which have their columns specified by a specific character, commonly the ,. E.g. from the diamonds.csv dataset:

+
carat,cut,color,clarity,depth,table,price,x,y,z
+0.23,Ideal,E,SI2,61.5,55,326,3.95,3.98,2.43
+0.21,Premium,E,SI1,59.8,61,326,3.89,3.84,2.31
+0.23,Good,E,VS1,56.9,65,327,4.05,4.07,2.31
+0.29,Premium,I,VS2,62.4,58,334,4.2,4.23,2.63
+0.31,Good,J,SI2,63.3,58,335,4.34,4.35,2.75
+
+
+

However, the delimiter can be any character. Additionally, a quotechar, commonly ", can be specified which allows a delimiter to be contained within a column value. E.g. from the blogposts.csv dataset:

+
Blog Post,Date,Subject,Field
+"Monty Hall, meet Game Theory",4/13/2014,Statistics,Mathematics
+Gaussian Quadrature,4/13/2014,Algorithms,Mathematics
+
+
+

Notice how "Monty Hall, meet Game Theory" is contained by the quotechar because it contains the delimiter value ,.

+

These delimiter dataset parameters (and more) can be automatically determined by the CSVData data reader, however they can also be set via the options as demonstrated later in this tutorial.

+
+
+

Intro to the CSVData data reader

+

Previously, it was shown that CSVData may automatically be detected using Data or can be manually specified by the user:

+
import dataprofiler as dp
+from dataprofiler.data_readers.csv_data import CSVData
+
+data = dp.Data(filepath)
+data = CSVData(filepath)
+
+
+
+
[ ]:
+
+
+
+# use data reader to read delimited data
+data_folder = "../dataprofiler/tests/data"
+csv_files = [
+    "csv/diamonds.csv",
+    "csv/all-strings-skip-header-author.csv", # csv files with the author/description on the first line
+    "csv/sparse-first-and-last-column-empty-first-row.txt", # csv file with the .txt extension
+]
+
+for file in csv_files:
+    data = CSVData(os.path.join(data_folder, file))
+    print(data.data.head())
+    print('=' * 80)
+
+
+
+
+
+

CSVData Options

+

As mentioned previously, CSVData has options that can be set to fine-tune its detection or to ensure the data is read in a specific manner. The options for CSVData are detailed below:

+
  • delimiter - delimiter used to decipher the csv input file

  • quotechar - quote character used in the delimited file

  • header - location of the header in the file

  • data_format - user selected format in which to return data; can only be of the specified types

  • selected_columns - columns being selected from the entire dataset
+
[ ]:
+
+
+
+# options are set via a dictionary object in which the parameters are specified.
+# these are the default values for each option
+options = {
+    "delimiter": ",",
+    "quotechar": '"',
+    "header": 'auto',
+    "data_format": "dataframe",  # type: str, choices: "dataframe", "records"
+    "selected_columns": list(),
+}
+
+
+
+
+
+

Options: delimiter and quotechar

+

Below, both the auto detection and use of options will be illustrated for delimiter and quotechar.

+
+
[ ]:
+
+
+
+# display the data we are reading
+filepath = "../dataprofiler/tests/data/csv/daily-activity-sheet-@-singlequote.csv"
+num_lines = 10
+with open(filepath) as fp:
+    print(''.join(fp.readlines()[:num_lines]))
+
+
+
+
+
[ ]:
+
+
+
+data = dp.Data(filepath)  # or use CSVData
+print('Auto detected')
+print('=' * 80)
+print('delimiter: ', data.delimiter)
+print('quotechar: ', data.quotechar)
+data.data.head()
+
+
+
+
+
[ ]:
+
+
+
+options = {'delimiter': '@', 'quotechar': "'"}
+data = dp.Data(filepath, options=options)  # or use CSVData
+print('manually set')
+print('=' * 80)
+print('delimiter: ', data.delimiter)
+print('quotechar: ', data.quotechar)
+data.data.head()
+
+
+
+
+
[ ]:
+
+
+
+# intentional failure with incorrect options
+options = {'delimiter': ',', 'quotechar': '"'}
+
+# will be interpreted as TextData because the delimiter and quotechar were incorrect
+data = dp.Data(filepath, options=options)
+print('intentional failure set')
+print('=' * 80)
+try:
+    print('delimiter: ', data.delimiter)  # attribute error raised here, bc TextData, not CSVData
+    print('quotechar: ', data.quotechar)
+
+    # should not reach this or something went wrong
+    raise Exception('Should have failed because this is detected as TextData.')
+except AttributeError:
+    print('When data_type is not set or the CSVData is not set, it will fail over to the\n'
+          'next best reader. In this case it is "TextData"\n')
+data.data
+
+
+
+
+
+

Options: header

+

Below, both the auto detection and use of options will be illustrated for header.

+

Notice how in the manually set mechanism, we are intentionally setting the header incorrectly to illustrate what happens.

+
+
[ ]:
+
+
+
+# display the data we are reading
+filepath = "../dataprofiler/tests/data/csv/sparse-first-and-last-column-header-and-author-description.txt"
+num_lines = 10
+with open(filepath) as fp:
+    print(''.join(fp.readlines()[:num_lines]))
+
+
+
+
+
[ ]:
+
+
+
+options = {'header': 'auto'}  # auto detected (default value)
+data = dp.Data(filepath, options=options)  # or use CSVData
+print('Data Header:', data.header)
+print('=' * 80)
+data.data.head()
+
+
+
+
+
[ ]:
+
+
+
+options = {'header': 2}  # intentionally set incorrectly at value 2
+data = dp.Data(filepath, options=options)  # or use CSVData
+print('Data Header:', data.header)
+print('=' * 80)
+data.data.head()
+
+
+
+
+
+

Options: data_format

+

For CSVData, the data_format option can have the following values:

+
  • dataframe - (default) loads the dataset as a pandas.DataFrame

  • records - loads the data as rows of text values; the extra parameter record_samples_per_line determines how many rows are combined into a single line

dataframe is used for conducting structured profiling of the dataset while records is for unstructured profiling.

+

Below, both the auto detection and use of options will be illustrated for data_format.

+
+
[ ]:
+
+
+
+# display the data we are reading
+filepath = "../dataprofiler/tests/data/csv/diamonds.csv"
+num_lines = 10
+with open(filepath) as fp:
+    print(''.join(fp.readlines()[:num_lines]))
+
+
+
+
+
[ ]:
+
+
+
+options = {'data_format': 'dataframe'}  # default
+data = dp.Data(filepath, options=options)  # or use CSVData
+data.data[:5]
+
+
+
+
+
[ ]:
+
+
+
+options = {'data_format': 'records', 'record_samples_per_line': 1}
+data = dp.Data(filepath, options=options)
+data.data[:5]
+
+
+
+
+
+

Options: selected columns

+

By default, all columns of a dataset will be read and loaded into the data reader. However, selected_columns can be set to only load columns which the user requests.

+
+
[ ]:
+
+
+
+# display the data we are reading
+filepath = "../dataprofiler/tests/data/csv/aws_honeypot_marx_geo.csv"
+num_lines = 10
+with open(filepath) as fp:
+    print(''.join(fp.readlines()[:num_lines]))
+
+
+
+
+
[ ]:
+
+
+
+options = {'selected_columns': ['datetime', 'host', 'src', 'proto']}
+data = dp.Data(filepath, options=options)
+data.data.head()
+
+
+
+
+
diff --git a/docs/0.7.1/html/data_reader.ipynb b/docs/0.7.1/html/data_reader.ipynb
new file mode 100644

diff --git a/docs/0.7.1/html/data_readers.html b/docs/0.7.1/html/data_readers.html
new file mode 100644

Data Readers

+

The Data class will identify and then output one of the following Data class types. Using a data reader is easy: just pass the file through the Data object.

import dataprofiler as dp
data = dp.Data("your_file.csv")

The supported file types are:

+
  • CSV file (or any delimited file)

  • JSON object

  • Avro file

  • Parquet file

  • Text file

  • Pandas DataFrame

  • A URL that points to one of the supported file types above

It’s also possible to call one of the data classes directly, as in the following command:

from dataprofiler.data_readers.csv_data import CSVData
data = CSVData("your_file.csv", options={"delimiter": ","})

Additionally, any of the data classes can be loaded using a URL:

import dataprofiler as dp
data = dp.Data("https://you_website.com/your_file.file", options={"verify_ssl": "True"})

Below are descriptions of the various Data classes and the available options.

+
+

CSVData

+

Data class for loading datasets of type CSV. Can be specified by passing in-memory data or via a file path. Options pertaining to the CSV may also be specified using the options dict parameter.

+

CSVData(input_file_path=None, data=None, options=None)

+

Possible options:

+
  • delimiter - Must be a string, for example “delimiter”: “,”

  • data_format - must be a string, possible choices: “dataframe”, “records”

  • selected_columns - columns being selected from the entire dataset, must be a list [“column 1”, “ssn”]

  • header - Define the header, for example

    • “header”: ‘auto’ for auto detection

    • “header”: None for no header

    • “header”: <INT> to specify the header row (0 based index)
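For instance, a tab-delimited file with a known header row could be read as follows (illustrative values only):

from dataprofiler.data_readers.csv_data import CSVData

options = {
    "delimiter": "\t",
    "header": 0,  # the first row is the header
    "selected_columns": ["column 1", "ssn"],
}
data = CSVData("your_file.tsv", options=options)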
+
+

JSONData

+

Data class for loading datasets of type JSON. Can be specified by passing in-memory data or via a file path. Options pertaining to the JSON may also be specified using the options dict parameter. JSON data can be accessed via the “data” property, the “metadata” property, and the “data_and_metadata” property.

+

JSONData(input_file_path=None, data=None, options=None)

+

Possible options:

+
  • data_format - must be a string, choices: “dataframe”, “records”, “json”, “flattened_dataframe”

    • “flattened_dataframe” is best used for JSON structure typically found in data streams that contain nested lists of dictionaries and a payload. For example: {“data”: [ columns ], “response”: 200}

  • selected_keys - columns being selected from the entire dataset, must be a list [“column 1”, “ssn”]

  • payload_keys - The dictionary keys for the payload of the JSON, typically called “data” or “payload”. Defaults to [“data”, “payload”, “response”].
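For instance, a nested data-stream payload could be flattened and restricted to specific keys (illustrative values only):

from dataprofiler.data_readers.json_data import JSONData

options = {
    "data_format": "flattened_dataframe",
    "selected_keys": ["column 1", "ssn"],
}
data = JSONData("your_file.json", options=options)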
+
+

AVROData

+

Data class for loading datasets of type AVRO. Can be specified by passing in-memory data or via a file path. Options pertaining to the AVRO may also be specified using the options dict parameter.

+

AVROData(input_file_path=None, data=None, options=None)

+

Possible options:

+
  • data_format - must be a string, choices: “dataframe”, “records”, “avro”, “json”, “flattened_dataframe”

    • “flattened_dataframe” is best used for AVROs with a JSON structure typically found in data streams that contain nested lists of dictionaries and a payload. For example: {“data”: [ columns ], “response”: 200}

  • selected_keys - columns being selected from the entire dataset, must be a list [“column 1”, “ssn”]
+
+

ParquetData

+

Data class for loading datasets of type PARQUET. Can be specified by passing in-memory data or via a file path. Options pertaining to the PARQUET may also be specified using the options dict parameter.

+

ParquetData(input_file_path=None, data=None, options=None)

+

Possible options:

+
  • data_format - must be a string, choices: “dataframe”, “records”, “json”

  • selected_keys - columns being selected from the entire dataset, must be a list [“column 1”, “ssn”]
+
+

TextData

+

Data class for loading datasets of type TEXT. Can be specified by passing in-memory data or via a file path. Options pertaining to the TEXT may also be specified using the options dict parameter.

+

TextData(input_file_path=None, data=None, options=None)

+

Possible options:

+
  • data_format: user selected format in which to return data. Currently only supports “text”.

  • samples_per_line - chunks by which to read in the specified dataset
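For instance (illustrative values only; the samples_per_line value here is hypothetical):

from dataprofiler.data_readers.text_data import TextData

options = {
    "data_format": "text",
    "samples_per_line": 1000,  # hypothetical chunk size
}
data = TextData("your_file.txt", options=options)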
+
+

Data Using a URL

+

Data class for loading datasets of any type using a URL. Specified by passing in any valid URL that points to one of the valid data types. Options pertaining to the URL may also be specified using the options dict parameter.

+

Data(input_file_path=None, data=None, options=None)

+

Possible options:

+
  • verify_ssl: must be a boolean string, choices: “True”, “False”. Set to “True” by default.
diff --git a/docs/0.7.1/html/dataprofiler.data_readers.avro_data.html b/docs/0.7.1/html/dataprofiler.data_readers.avro_data.html
new file mode 100644

Avro Data

+
+
+class dataprofiler.data_readers.avro_data.AVROData(input_file_path=None, data=None, options=None)
+

Bases: dataprofiler.data_readers.json_data.JSONData, dataprofiler.data_readers.base_data.BaseData

+

AVROData class to save and load spreadsheet data

+

Data class for loading datasets of type AVRO. Can be specified by passing in-memory data or via a file path. Options pertaining to the AVRO may also be specified using the options dict parameter. Possible Options:

+
options = dict(
+    data_format= type: str, choices: "dataframe", "records", "avro"
+    selected_keys= type: list(str)
+)
+
+
+

data_format: user selected format in which to return data; can only be of the specified types
selected_keys: keys being selected from the entire dataset

+
+
Parameters
+
    +
  • input_file_path (str) – path to the file being loaded or None

  • +
  • data (multiple types) – data being loaded into the class instead of an input file

  • +
  • options (dict) – options pertaining to the data type

  • +
+
+
Returns
+

None

+
+
+
+
+data_type = 'avro'
+
+
+
+property file_encoding
+

Set file encoding to None since not detected for avro.

+
+
+
+classmethod is_match(file_path, options=None)
+

Test the given file to check if the file has valid +AVRO format or not.

+
+
Parameters
+
    +
  • file_path (str) – path to the file to be examined

  • +
  • options (dict) – avro read options

  • +
+
+
Returns
+

is file a avro file or not

+
+
Return type
+

bool

+
+
+
+
+
+property data
+
+
+
+property data_and_metadata
+

Returns a data frame that joins the data and the metadata.

+
+
+
+property data_format
+
+
+
+get_batch_generator(batch_size)
+
+
+
+info = None
+
+
+
+property is_structured
+

Determines compatibility with StructuredProfiler

+
+
+
+property length
+

Returns the length of the dataset which is loaded.

+
+
Returns
+

length of the dataset

+
+
+
+
+
+property metadata
+

Returns a data frame that contains the metadata

+
+
+
+reload(input_file_path=None, data=None, options=None)
+

Reload the data class with a new dataset. This erases all existing +data/options and replaces it with the input data/options.

+
+
Parameters
+
    +
  • input_file_path (str) – path to the file being loaded or None

  • +
  • data (multiple types) – data being loaded into the class instead of an input file

  • +
  • options (dict) – options pertaining to the data type

  • +
+
+
Returns
+

None

+
+
+
+
+
+property selected_keys
+
+
+
diff --git a/docs/0.7.1/html/dataprofiler.data_readers.base_data.html b/docs/0.7.1/html/dataprofiler.data_readers.base_data.html
new file mode 100644

Base Data

+
+
+class dataprofiler.data_readers.base_data.BaseData(input_file_path, data, options)
+

Bases: object

+

Abstract class for data loading and saving

+

Base class for loading a dataset. Options can be specified and may be more specific to the subclasses.

+
+
Parameters
+
    +
  • input_file_path (str) – path to the file being loaded or None

  • +
  • data (multiple types) – data being loaded into the class instead of an input file

  • +
  • options (dict) – options pertaining to the data type

  • +
+
+
Returns
+

None

+
+
+
+
+data_type = None
+
+
+
+info = None
+
+
+
+property data
+
+
+
+property is_structured
+

Determines compatibility with StructuredProfiler

+
+
+
+property data_format
+
+
+
+property file_encoding
+
+
+
+get_batch_generator(batch_size)
+
+
+
+classmethod is_match(input_file_path, options)
+
+
+
+reload(input_file_path, data, options)
+

Reload the data class with a new dataset. This erases all existing +data/options and replaces it with the input data/options.

+
+
Parameters
+
    +
  • input_file_path (str) – path to the file being loaded or None

  • +
  • data (multiple types) – data being loaded into the class instead of an input file

  • +
  • options (dict) – options pertaining to the data type

  • +
+
+
Returns
+

None

+
+
+
+
+
+property length
+

Returns the length of the dataset which is loaded.

+
+
Returns
+

length of the dataset

+
+
+
+
+
diff --git a/docs/0.7.1/html/dataprofiler.data_readers.csv_data.html b/docs/0.7.1/html/dataprofiler.data_readers.csv_data.html
new file mode 100644

CSV Data

+
+
+class dataprofiler.data_readers.csv_data.CSVData(input_file_path=None, data=None, options=None)
+

Bases: dataprofiler.data_readers.structured_mixins.SpreadSheetDataMixin, dataprofiler.data_readers.base_data.BaseData

+

SpreadsheetData class to save and load spreadsheet data

+

Data class for loading datasets of type CSV. Can be specified by passing in-memory data or via a file path. Options pertaining to the CSV may also be specified using the options dict parameter. Possible Options:

+
options = dict(
+    delimiter= type: str
+    data_format= type: str, choices: "dataframe", "records"
+    record_samples_per_line= type: int (only for "records")
+    selected_columns= type: list(str)
+    header= type: any
+)
+
+
+

delimiter: delimiter used to decipher the csv input file

data_format: user selected format in which to return data; can only be of the specified types:

    dataframe - (default) loads the dataset as a pandas.DataFrame
    records - loads the data as rows of text values; the extra parameter “record_samples_per_line” determines how many rows are combined into a single line

selected_columns: columns being selected from the entire dataset
header: location of the header in the file
quotechar: quote character used in the delimited file

+
+
Parameters
+
    +
  • input_file_path (str) – path to the file being loaded or None

  • +
  • data (multiple types) – data being loaded into the class instead of an input file

  • +
  • options (dict) – options pertaining to the data type

  • +
+
+
Returns
+

None

+
+
+
+
+data_type = 'csv'
+
+
+
+property selected_columns
+
+
+
+property delimiter
+
+
+
+property quotechar
+
+
+
+property header
+
+
+
+property is_structured
+

Determines compatibility with StructuredProfiler

+
+
+
+property data
+
+
+
+property data_format
+
+
+
+property file_encoding
+
+
+
+get_batch_generator(batch_size)
+
+
+
+info = None
+
+
+
+classmethod is_match(file_path, options=None)
+

Test the first 1000 lines of a given file to check if the file has valid +delimited format or not.

+
+
Parameters
+
    +
  • file_path (str) – path to the file to be examined

  • +
  • options (dict) – delimiter read options dict(delimiter=”,”)

  • +
+
+
Returns
+

is file a csv file or not

+
+
Return type
+

bool

+
+
+
+
+
+property length
+

Returns the length of the dataset which is loaded.

+
+
Returns
+

length of the dataset

+
+
+
+
+
+reload(input_file_path=None, data=None, options=None)
+

Reload the data class with a new dataset. This erases all existing +data/options and replaces it with the input data/options.

+
+
Parameters
+
    +
  • input_file_path (str) – path to the file being loaded or None

  • +
  • data (multiple types) – data being loaded into the class instead of an input file

  • +
  • options (dict) – options pertaining to the data type

  • +
+
+
Returns
+

None

+
+
+
+
+
diff --git a/docs/0.7.1/html/dataprofiler.data_readers.data.html b/docs/0.7.1/html/dataprofiler.data_readers.data.html
new file mode 100644

Data

+
+
+class dataprofiler.data_readers.data.Data(input_file_path=None, data=None, data_type=None, options=None)
+

Bases: object

+

Factory Data class. Auto-detects the data type if not specified for input files. Returns the proper data class or the specified data class for the given data or input file.

+
+
Parameters
+
    +
  • input_file_path

  • +
  • data

  • +
  • data_type

  • +
  • options

  • +
+
+
Returns
+

+
+
+
+
+data_classes = [{'data_class': <class 'dataprofiler.data_readers.json_data.JSONData'>, 'kwargs': {}}, {'data_class': <class 'dataprofiler.data_readers.csv_data.CSVData'>, 'kwargs': {}}, {'data_class': <class 'dataprofiler.data_readers.parquet_data.ParquetData'>, 'kwargs': {}}, {'data_class': <class 'dataprofiler.data_readers.avro_data.AVROData'>, 'kwargs': {}}, {'data_class': <class 'dataprofiler.data_readers.text_data.TextData'>, 'kwargs': {}}]
+
+
+
diff --git a/docs/0.7.1/html/dataprofiler.data_readers.data_utils.html b/docs/0.7.1/html/dataprofiler.data_readers.data_utils.html
new file mode 100644

Data Utils

+
+
+dataprofiler.data_readers.data_utils.data_generator(data_list)
+

Takes a list and returns a generator on the list.

+
+
Parameters
+

data_list (list) – list of strings

+
+
Returns
+

item from the list

+
+
Return type
+

generator

+
+
+
+
+
+dataprofiler.data_readers.data_utils.generator_on_file(file_object)
+

Takes a file and returns a generator that returns lines

+
+
Parameters
+

file_path (path) – path to the file

+
+
Returns
+

Line from file

+
+
Return type
+

generator

+
+
+
+
+
+dataprofiler.data_readers.data_utils.convert_int_to_string(x)
+

Converts the given input to string. In particular, if it is an int, it converts it while ensuring there is no . or 00 in the converted string. In addition, if the input is np.nan, the output will be ‘nan’, which is what we need to handle the data properly.

+
+
Parameters
+

x (Union[int, float, str, numpy.nan]) –

+
+
Returns
+

+
+
Return type
+

str

+
+
+
+
+
+dataprofiler.data_readers.data_utils.unicode_to_str(data, ignore_dicts=False)
+

Convert data to string representation if it is a unicode string.

+
+
Parameters
+
    +
  • data (str) – input data

  • +
  • ignore_dicts (boolean) – if set, ignore the dictionary type processing

  • +
+
+
Returns
+

string representation of data

+
+
Return type
+

str

+
+
+
+
+
+dataprofiler.data_readers.data_utils.json_to_dataframe(json_lines, selected_columns=None, read_in_string=False)
+

This function takes a list of JSON objects and returns the dataframe representing the JSON list.

+
+
Parameters
+
    +
  • json_lines (list(dict)) – list of json objects

  • +
  • selected_columns (list(str)) – a list of keys to be processed

  • +
  • read_in_string (bool) – if True, all the values in dataframe will be +converted to string

  • +
+
+
Returns
+

dataframe converted from json list and list of dtypes for each +column

+
+
Return type
+

tuple(pd.DataFrame, pd.Series(dtypes))

+
+
+
+
+
+dataprofiler.data_readers.data_utils.read_json_df(data_generator, selected_columns=None, read_in_string=False)
+

This function returns an iterator that returns a chunk of data as a dataframe in each call. The source of input to this function is either a file or a list of JSON-structured strings. If the file path is given as input, the file is expected to have one JSON structure per line. Lines that are not valid JSON will be ignored; therefore, a file with pretty-printed JSON objects will not be considered valid JSON. If the input is a data list, it is expected to be a list of strings where each string is a valid JSON object. If an individual object is not valid JSON, it will be ignored.

+

NOTE: both data_list and file_path cannot be passed at the same time.

+
+
Parameters
+
    +
  • data_generator (generator) – The generator you want to read.

  • +
  • selected_columns (list(str)) – a list of keys to be processed

  • +
  • read_in_string (bool) – if True, all the values in dataframe will be +converted to string

  • +
+
+
Returns
+

returns an iterator that returns a chunk of file as dataframe in +each call as well as original dtypes of the dataframe columns.

+
+
Return type
+

typle(Iterator(pd.DataFrame), pd.Series(dtypes)

+
+
+
+
+
+dataprofiler.data_readers.data_utils.read_json(data_generator, selected_columns=None, read_in_string=False)
+

This function returns the lines of a JSON source. The source of input to this +function is either a file or a list of JSON-structured strings. If a +file path is given as input, the file is expected to have one JSON +structure on each line. Lines that are not valid JSON will be ignored. +Therefore, a file with pretty-printed JSON objects will not be considered +valid JSON. If the input is a data list, it is expected to be a list of +strings where each string is a valid JSON object. If an individual object +is not valid JSON, it will be ignored.

+

NOTE: both data_list and file_path cannot be passed at the same time.

+
+
Parameters
+
    +
  • data_generator (generator) – The generator you want to read.

  • +
  • selected_columns (list(str)) – a list of keys to be processed

  • +
  • read_in_string (bool) – if True, all the values in dataframe will be +converted to string

  • +
+
+
Returns
+

returns the lines of a json file

+
+
Return type
+

list(dict)

+
+
+
+
+
+dataprofiler.data_readers.data_utils.read_csv_df(file_path, delimiter, header, selected_columns=[], read_in_string=False, encoding='utf-8')
+

Reads a CSV file in chunks and returns a dataframe in the form of an iterator.

+
+
Parameters
+
    +
  • file_path (str) – path to the CSV file.

  • +
  • delimiter (str) – character used to separate csv values.

  • +
  • header (int) – the header row in the csv file.

  • +
  • selected_columns (list(str)) – a list of columns to be processed

  • +
  • read_in_string (bool) – if True, all the values in dataframe will be +converted to string

  • +
+
+
Returns
+

Iterator

+
+
Return type
+

Iterator(pd.DataFrame)

+
+
+
+
+
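A minimal usage sketch (the file path and column names are hypothetical):

from dataprofiler.data_readers import data_utils

# Hypothetical CSV path; header=0 treats the first row as column names
result = data_utils.read_csv_df("data.csv", delimiter=",", header=0,
                                selected_columns=["col_a", "col_b"],
                                read_in_string=True)
# Per the description above the result is an iterator-style dataframe read;
# inspect the type to confirm whether a single DataFrame or chunked iterator is returned
print(type(result))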
+dataprofiler.data_readers.data_utils.read_parquet_df(file_path, selected_columns=None, read_in_string=False)
+

Returns an iterator that returns one row group each time.

+
+
Parameters
+

file_path (str) – path to the Parquet file.

+
+
Returns
+

+
+
Return type
+

Iterator(pd.DataFrame)

+
+
+
+
+
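A minimal usage sketch (the parquet path is hypothetical):

from dataprofiler.data_readers import data_utils

# Per the description above, iteration over the result yields one row group at a time
result = data_utils.read_parquet_df("data.parquet")
print(type(result))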
+dataprofiler.data_readers.data_utils.read_text_as_list_of_strs(file_path, encoding=None)
+

Returns a list of strings relative to the chunk size. Each line is 1 chunk.

+
+
Parameters
+

file_path (str) – path to the file

+
+
Returns
+

+
+
Return type
+

list(str)

+
+
+
+
+
+dataprofiler.data_readers.data_utils.detect_file_encoding(file_path, buffer_size=1024, max_lines=20)
+

Determines the encoding of files within the initial max_lines of length +buffer_size.

+
+
Parameters
+
    +
  • file_path (str) – path to the file

  • +
  • buffer_size (int) – buffer length for each line being read

  • +
  • max_lines (int) – number of lines to read from file of length buffer_size

  • +
+
+
Returns
+

encoding type

+
+
Return type
+

str

+
+
+
+
+
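A minimal usage sketch (the file path is hypothetical):

from dataprofiler.data_readers import data_utils

# Inspects up to max_lines chunks of buffer_size bytes to guess the encoding
encoding = data_utils.detect_file_encoding("data.csv", buffer_size=1024, max_lines=20)
print(encoding)  # e.g. 'utf-8' or 'ascii'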
+dataprofiler.data_readers.data_utils.detect_cell_type(cell)
+

Detects the cell type (int, float, etc)

+
+
Parameters
+

cell (str) – String designated for data type detection

+
+
+
+
+
+dataprofiler.data_readers.data_utils.get_delimiter_regex(delimiter=',', quotechar=',')
+

Builds regex for delimiter checks

+
+
Parameters
+
    +
  • delimiter (str) – Delimiter to be added to regex

  • +
  • quotechar – Quotechar to be added to regex

  • +
+
+
+
+
+
+dataprofiler.data_readers.data_utils.find_nth_loc(string=None, search_query=None, n=0)
+

Searches the string for the search_query and +returns the index at which its nth occurrence starts. +If there are fewer than n occurrences, the location of the last occurrence is returned.

+
+
Parameters
+
    +
  • string (str) – Input string, to be searched

  • +
• search_query (str) – char(s) to find the nth occurrence of

  • +
• n (int) – The number of occurrences to iterate through

  • +
+
+
Return idx
+

Index of the nth or last occurrence of the search_query

+
+
Rtype idx
+

int

+
+
Return id_count
+

Number of identifications prior to idx

+
+
Rtype id_count
+

int

+
+
+
+
+
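A minimal sketch (the sample string is illustrative; the two return values follow the documentation above):

from dataprofiler.data_readers import data_utils

# Find the index of the 2nd newline in a small string
idx, id_count = data_utils.find_nth_loc("a\nb\nc", search_query="\n", n=2)
print(idx, id_count)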
+dataprofiler.data_readers.data_utils.load_as_str_from_file(file_path, file_encoding=None, max_lines=10, max_bytes=65536, chunk_size_bytes=1024)
+

Loads data from a csv file up to a specific line OR byte_size.

+
+
Parameters
+
    +
  • file_path (str) – Path to file to load data from

  • +
  • file_encoding (str) – File encoding

  • +
  • max_lines (int) – Maximum number of lines to load from file

  • +
  • max_bytes (int) – Maximum number of bytes to load from file

  • +
  • chunk_size_bytes (int) – Chunk size to load every data load

  • +
+
+
Return data_as_str
+

Data as string

+
+
+
+
+
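A minimal usage sketch (the file path is hypothetical):

from dataprofiler.data_readers import data_utils

# Stops after max_lines lines or max_bytes bytes, whichever comes first
sample = data_utils.load_as_str_from_file("data.csv", file_encoding="utf-8",
                                          max_lines=10, max_bytes=65536,
                                          chunk_size_bytes=1024)
print(sample[:200])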
+dataprofiler.data_readers.data_utils.is_stream_buffer(filepath_or_buffer)
+

Determines whether a given argument is a filepath or buffer.

+
+
Parameters
+

filepath_or_buffer (str) – path to the file or buffer

+
+
Returns
+

true if string is a buffer or false if string is a filepath

+
+
Return type
+

boolean

+
+
+
+
+
+dataprofiler.data_readers.data_utils.is_valid_url(url_as_string)
+

Determines whether a given string is a valid URL

+
+
Parameters
+

url_as_string (str) – string to be tested if URL

+
+
Returns
+

true if string is a valid URL

+
+
Return type
+

boolean

+
+
+
+
+
+dataprofiler.data_readers.data_utils.url_to_bytes(url_as_string, options)
+

Reads in URL and converts it to a byte stream

+
+
Parameters
+
    +
  • url_as_string (str) – string to read as URL

  • +
  • options (dict) – options for the url

  • +
+
+
Returns
+

BytesIO stream of data downloaded from URL

+
+
Return type
+

BytesIO stream

+
+
+
+
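A minimal sketch combining the two URL helpers above (the URL is hypothetical, and passing an empty options dict is an assumption):

from dataprofiler.data_readers import data_utils

url = "https://example.com/data.csv"  # hypothetical URL
if data_utils.is_valid_url(url):
    # options are passed through to the download; an empty dict is assumed acceptable here
    stream = data_utils.url_to_bytes(url, options={})
    print(stream.getbuffer().nbytes)  # size of the downloaded payload in bytes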
+ +
+ +
+ +
+
+ + + + + + + \ No newline at end of file diff --git a/docs/0.7.1/html/dataprofiler.data_readers.filepath_or_buffer.html b/docs/0.7.1/html/dataprofiler.data_readers.filepath_or_buffer.html new file mode 100644 index 000000000..f1acc75cb --- /dev/null +++ b/docs/0.7.1/html/dataprofiler.data_readers.filepath_or_buffer.html @@ -0,0 +1,292 @@ + + + + + + + + + Filepath Or Buffer - Data Profiler v0.7.1 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Filepath Or Buffer

+
+
+class dataprofiler.data_readers.filepath_or_buffer.FileOrBufferHandler(filepath_or_buffer, open_method='r', encoding=None, seek_offset=None, seek_whence=0)
+

Bases: object

+

FileOrBufferHandler class to read a filepath or buffer in and always +return a readable buffer.

+

Context manager class used for inputting a file or buffer and returning +a structure that is always a buffer.

+
+
Parameters
+
    +
  • filepath_or_buffer (Union[str, StringIO, BytesIO]) – path to the file being loaded or buffer

  • +
  • open_method (string) – value describes the mode the file is opened in

  • +
  • seek_offset (int) – offset from start of the stream

  • +
+
+
Returns
+

TextIOBase or BufferedIOBase class/subclass

+
+
+
+
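A minimal sketch of the context-manager usage described above (the in-memory buffer is illustrative):

from io import StringIO
from dataprofiler.data_readers.filepath_or_buffer import FileOrBufferHandler

# Works the same whether given a file path or an in-memory buffer
buffer = StringIO("col_a,col_b\n1,2\n")
with FileOrBufferHandler(buffer, open_method='r') as stream:
    print(stream.readline())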
+ +
+ +
+ +
+
+ + + + + + + \ No newline at end of file diff --git a/docs/0.7.1/html/dataprofiler.data_readers.html b/docs/0.7.1/html/dataprofiler.data_readers.html new file mode 100644 index 000000000..67b638e4f --- /dev/null +++ b/docs/0.7.1/html/dataprofiler.data_readers.html @@ -0,0 +1,312 @@ + + + + + + + + + Data Readers - Data Profiler v0.7.1 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+ + +
+ +
+
+ + + + + + + \ No newline at end of file diff --git a/docs/0.7.1/html/dataprofiler.data_readers.json_data.html b/docs/0.7.1/html/dataprofiler.data_readers.json_data.html new file mode 100644 index 000000000..839c92e7f --- /dev/null +++ b/docs/0.7.1/html/dataprofiler.data_readers.json_data.html @@ -0,0 +1,397 @@ + + + + + + + + + JSON Data - Data Profiler v0.7.1 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

JSON Data

+
+
+class dataprofiler.data_readers.json_data.JSONData(input_file_path=None, data=None, options=None)
+

Bases: dataprofiler.data_readers.structured_mixins.SpreadSheetDataMixin, dataprofiler.data_readers.base_data.BaseData

+

SpreadsheetData class to save and load spreadsheet data

+

Data class for loading datasets of type JSON. Can be specified by +passing in-memory data or via a file path. Options pertaining to the JSON +may also be specified using the options dict parameter. +Possible Options:

+
options = dict(
+    data_format= type: str, choices: "dataframe", "records", "json",
+     "flattened_dataframe"
+    selected_keys= type: list(str)
+    payload_keys= type: Union[str, list(str)]
+)
+
+
+

data_format: user selected format in which to return data +can only be of specified types +selected_keys: keys being selected from the entire dataset +payload_keys: list of dictionary keys that determine the payload

+
+
Parameters
+
    +
  • input_file_path (str) – path to the file being loaded or None

  • +
  • data (multiple types) – data being loaded into the class instead of an input file

  • +
  • options (dict) – options pertaining to the data type

  • +
+
+
Returns
+

None

+
+
+
+
+data_type = 'json'
+
+
+
+property selected_keys
+
+
+
+property metadata
+

Returns a data frame that contains the metadata

+
+
+
+property data_and_metadata
+

Returns a data frame that joins the data and the metadata.

+
+
+
+property is_structured
+

Determines compatibility with StructuredProfiler

+
+
+
+classmethod is_match(file_path, options=None)
+

Test the first 1000 lines of a given file to check if the file has valid +JSON format or not. At least 60 percent of the lines in the first 1000 +lines have to be valid json.

+
+
Parameters
+
    +
  • file_path (str) – path to the file to be examined

  • +
  • options (dict) – json read options

  • +
+
+
Returns
+

is file a json file or not

+
+
Return type
+

bool

+
+
+
+
+
+property data
+
+
+
+property data_format
+
+
+
+property file_encoding
+
+
+
+get_batch_generator(batch_size)
+
+
+
+info = None
+
+
+
+property length
+

Returns the length of the dataset which is loaded.

+
+
Returns
+

length of the dataset

+
+
+
+
+
+reload(input_file_path=None, data=None, options=None)
+

Reload the data class with a new dataset. This erases all existing +data/options and replaces it with the input data/options.

+
+
Parameters
+
    +
  • input_file_path (str) – path to the file being loaded or None

  • +
  • data (multiple types) – data being loaded into the class instead of an input file

  • +
  • options (dict) – options pertaining to the data type

  • +
+
+
Returns
+

None

+
+
+
+
+
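A minimal usage sketch (the file path is hypothetical; the option keys follow the block above):

from dataprofiler.data_readers.json_data import JSONData

# Hypothetical newline-delimited JSON file
options = {"data_format": "flattened_dataframe"}
data = JSONData(input_file_path="records.json", options=options)
print(data.data_type)      # 'json'
print(data.is_structured)  # compatibility with StructuredProfiler
print(data.length)         # number of records loaded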
+ +
+ +
+ +
+
+ + + + + + + \ No newline at end of file diff --git a/docs/0.7.1/html/dataprofiler.data_readers.parquet_data.html b/docs/0.7.1/html/dataprofiler.data_readers.parquet_data.html new file mode 100644 index 000000000..ae7236f5c --- /dev/null +++ b/docs/0.7.1/html/dataprofiler.data_readers.parquet_data.html @@ -0,0 +1,385 @@ + + + + + + + + + Parquet Data - Data Profiler v0.7.1 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Parquet Data

+
+
+class dataprofiler.data_readers.parquet_data.ParquetData(input_file_path=None, data=None, options=None)
+

Bases: dataprofiler.data_readers.structured_mixins.SpreadSheetDataMixin, dataprofiler.data_readers.base_data.BaseData

+

SpreadsheetData class to save and load spreadsheet data

+

Data class for loading datasets of type PARQUET. Can be specified by +passing in-memory data or via a file path. Options pertaining to the +PARQUET may also be specified using the options dict parameter. +Possible Options:

+
options = dict(
+    data_format= type: str, choices: "dataframe", "records", "json"
+    selected_columns= type: list(str)
+    header= type: any
+)
+
+
+

data_format: user selected format in which to return data +can only be of specified types +selected_columns: columns being selected from the entire dataset

+
+
Parameters
+
    +
  • input_file_path (str) – path to the file being loaded or None

  • +
  • data (multiple types) – data being loaded into the class instead of an input file

  • +
  • options (dict) – options pertaining to the data type

  • +
+
+
Returns
+

None

+
+
+
+
+data_type = 'parquet'
+
+
+
+property file_encoding
+

Set file encoding to None since it is not detected for parquet.

+
+
+
+property selected_columns
+
+
+
+property is_structured
+

Determines compatibility with StructuredProfiler

+
+
+
+classmethod is_match(file_path, options=None)
+

Test the given file to check if the file has valid +Parquet format or not.

+
+
Parameters
+
    +
  • file_path (str) – path to the file to be examined

  • +
  • options (dict) – parquet read options

  • +
+
+
Returns
+

is file a parquet file or not

+
+
Return type
+

bool

+
+
+
+
+
+reload(input_file_path=None, data=None, options=None)
+

Reload the data class with a new dataset. This erases all existing +data/options and replaces it with the input data/options.

+
+
Parameters
+
    +
  • input_file_path (str) – path to the file being loaded or None

  • +
  • data (multiple types) – data being loaded into the class instead of an input file

  • +
  • options (dict) – options pertaining to the data type

  • +
+
+
Returns
+

None

+
+
+
+
+
+property data
+
+
+
+property data_format
+
+
+
+get_batch_generator(batch_size)
+
+
+
+info = None
+
+
+
+property length
+

Returns the length of the dataset which is loaded.

+
+
Returns
+

length of the dataset

+
+
+
+
+
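A minimal usage sketch (the file path is hypothetical; the option keys follow the block above):

from dataprofiler.data_readers.parquet_data import ParquetData

data = ParquetData(input_file_path="data.parquet",
                   options={"data_format": "dataframe"})
print(data.data_type)  # 'parquet'
print(data.length)     # number of rows loaded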
+ +
+ +
+ +
+
+ + + + + + + \ No newline at end of file diff --git a/docs/0.7.1/html/dataprofiler.data_readers.structured_mixins.html b/docs/0.7.1/html/dataprofiler.data_readers.structured_mixins.html new file mode 100644 index 000000000..a915c92f3 --- /dev/null +++ b/docs/0.7.1/html/dataprofiler.data_readers.structured_mixins.html @@ -0,0 +1,290 @@ + + + + + + + + + Structured Mixins - Data Profiler v0.7.1 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Structured Mixins

+
+
+class dataprofiler.data_readers.structured_mixins.SpreadSheetDataMixin(input_file_path, data, options)
+

Bases: object

+

Mixin data class for loading datasets of type SpreadSheet. +Adds specialized functions for loading data from a string or file.

+
+
Parameters
+
    +
  • input_file_path (str) – path to the file being loaded or None

  • +
  • data (multiple types) – data being loaded into the class instead of an input file

  • +
  • options (dict) – options pertaining to the data type

  • +
+
+
Returns
+

None

+
+
+
+
+ +
+ +
+ +
+
+ + + + + + + \ No newline at end of file diff --git a/docs/0.7.1/html/dataprofiler.data_readers.text_data.html b/docs/0.7.1/html/dataprofiler.data_readers.text_data.html new file mode 100644 index 000000000..47d289356 --- /dev/null +++ b/docs/0.7.1/html/dataprofiler.data_readers.text_data.html @@ -0,0 +1,386 @@ + + + + + + + + + Text Data - Data Profiler v0.7.1 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Text Data

+
+
+class dataprofiler.data_readers.text_data.TextData(input_file_path=None, data=None, options=None)
+

Bases: dataprofiler.data_readers.base_data.BaseData

+

TextData class to save and load text files

+

Data class for loading datasets of type TEXT. Can be specified by +passing in-memory data or via a file path. Options pertaining to the TEXT +may also be specified using the options dict parameter. +Possible Options:

+
options = dict(
+    data_format= type: str, choices: "text"
+    samples_per_line= type: int
+)
+
+
+

data_format: user selected format in which to return data +can only be of specified types +samples_per_line: chunks by which to read in the specified dataset

+
+
Parameters
+
    +
  • input_file_path (str) – path to the file being loaded or None

  • +
  • data (multiple types) – data being loaded into the class instead of an input file

  • +
  • options (dict) – options pertaining to the data type

  • +
+
+
Returns
+

None

+
+
+
+
+data_type = 'text'
+
+
+
+property samples_per_line
+
+
+
+property is_structured
+

Determines compatibility with StructuredProfiler

+
+
+
+tokenize()
+
+
+
+classmethod is_match(file_path, options=None)
+

All files can be considered text files, hence returns True

+
+
Parameters
+
    +
  • file_path (str) – path to the file to be examined

  • +
  • options (dict) – text file read options

  • +
+
+
Returns
+

is file a text file or not

+
+
Return type
+

bool

+
+
+
+
+
+reload(input_file_path=None, data=None, options=None)
+

Reload the data class with a new dataset. This erases all existing +data/options and replaces it with the input data/options.

+
+
Parameters
+
    +
  • input_file_path (str) – path to the file being loaded or None

  • +
  • data (multiple types) – data being loaded into the class instead of an input file

  • +
  • options (dict) – options pertaining to the data type

  • +
+
+
Returns
+

None

+
+
+
+
+
+property data
+
+
+
+property data_format
+
+
+
+property file_encoding
+
+
+
+get_batch_generator(batch_size)
+
+
+
+info = None
+
+
+
+property length
+

Returns the length of the dataset which is loaded.

+
+
Returns
+

length of the dataset

+
+
+
+
+
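A minimal usage sketch (the file path is hypothetical; the option keys follow the block above):

from dataprofiler.data_readers.text_data import TextData

# samples_per_line controls chunking as described above
data = TextData(input_file_path="notes.txt",
                options={"data_format": "text", "samples_per_line": 1000})
print(data.data_type)      # 'text'
print(data.is_structured)  # text data is treated as unstructured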
+ +
+ +
+ +
+
+ + + + + + + \ No newline at end of file diff --git a/docs/0.7.1/html/dataprofiler.dp_logging.html b/docs/0.7.1/html/dataprofiler.dp_logging.html new file mode 100644 index 000000000..8abd89283 --- /dev/null +++ b/docs/0.7.1/html/dataprofiler.dp_logging.html @@ -0,0 +1,285 @@ + + + + + + + + + Dp Logging - Data Profiler v0.7.1 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Dp Logging

+
+
+dataprofiler.dp_logging.get_logger()
+

Access DataProfiler-specific logger

+
+
+
+dataprofiler.dp_logging.set_verbosity(level)
+

Set the verbosity level for the DataProfiler logger. Must be set to one of +[logging.NOTSET, logging.DEBUG, logging.INFO, logging.WARNING, logging.ERROR, logging.CRITICAL].

+
+
+
Parameters
+

level (int) – Verbosity level from logging module

+
+
+
+
+
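A minimal usage sketch:

import logging
from dataprofiler import dp_logging

# Silence everything below WARNING for the DataProfiler logger
dp_logging.set_verbosity(logging.WARNING)
logger = dp_logging.get_logger()
logger.warning("profiling run starting")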
+dataprofiler.dp_logging.get_child_logger(name)
+

Returns logger for given filepath

+
+
Parameters
+

name (str) – name of file in need of accessing child logger

+
+
Returns
+

Logger instance for given file

+
+
+
+
+ +
+ +
+ +
+
+ + + + + + + \ No newline at end of file diff --git a/docs/0.7.1/html/dataprofiler.html b/docs/0.7.1/html/dataprofiler.html new file mode 100644 index 000000000..f5b330889 --- /dev/null +++ b/docs/0.7.1/html/dataprofiler.html @@ -0,0 +1,362 @@ + + + + + + + + + Dataprofiler - Data Profiler v0.7.1 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+ + +
+
+ + + + + + + \ No newline at end of file diff --git a/docs/0.7.1/html/dataprofiler.labelers.base_data_labeler.html b/docs/0.7.1/html/dataprofiler.labelers.base_data_labeler.html new file mode 100644 index 000000000..c3983d91f --- /dev/null +++ b/docs/0.7.1/html/dataprofiler.labelers.base_data_labeler.html @@ -0,0 +1,869 @@ + + + + + + + + + Base Data Labeler - Data Profiler v0.7.1 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Base Data Labeler

+
+
+class dataprofiler.labelers.base_data_labeler.BaseDataLabeler(dirpath=None, load_options=None)
+

Bases: object

+

Initialize DataLabeler class.

+
+
Parameters
+
    +
  • dirpath – path to data labeler

  • +
  • load_options – optional arguments to include for load i.e. class +for model or processors

  • +
+
+
+
+
+help()
+

Help function describing alterable parameters, input data formats +for preprocessors, and output data formats for postprocessors.

+
+
Returns
+

None

+
+
+
+
+
+property label_mapping
+

Retrieves the label encodings

+
+
Returns
+

dictionary for associating labels to indexes

+
+
+
+
+
+property reverse_label_mapping
+

Retrieves the index to label encoding

+
+
Returns
+

dictionary for associating indexes to labels

+
+
+
+
+
+property labels
+

Retrieves the label

+
+
Returns
+

list of labels

+
+
+
+
+
+property preprocessor
+

Retrieves the data preprocessor

+
+
Returns
+

returns the preprocessor instance

+
+
+
+
+
+property model
+

Retrieves the data labeler model

+
+
Returns
+

returns the model instance

+
+
+
+
+
+property postprocessor
+

Retrieves the data postprocessor

+
+
Returns
+

returns the postprocessor instance

+
+
+
+
+
+set_params(params)
+

Allows user to set parameters of pipeline components in the following +format:

+
+
+
params = dict(

preprocessor=dict(…), +model=dict(…), +postprocessor=dict(…)

+
+
+

)

+
+

where the key,values pairs for each pipeline component must match +parameters that exist in their components.

+
+
Parameters
+

params (dict) –

dictionary containing a key for a given pipeline +component and its associated value of parameters as such:

+
+

dict(preprocessor=dict(…), model=dict(…), +postprocessor=dict(…))

+
+

+
+
Returns
+

None

+
+
+
+
+
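A minimal sketch of the parameter format described above (the labeler name and the parameter names inside each dict are illustrative and must match parameters that actually exist in the components):

from dataprofiler.labelers.base_data_labeler import BaseDataLabeler

labeler = BaseDataLabeler.load_from_library("structured_model")  # name assumed
labeler.set_params({
    "preprocessor": {"max_length": 3400},    # hypothetical parameter
    "model": {"max_char_encoding_id": 127},  # hypothetical parameter
})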
+add_label(label, same_as=None)
+

Adds a label to the data labeler.

+
+
Parameters
+
    +
  • label (str) – new label being added to the data labeler

  • +
  • same_as (str) – label to have the same encoding index as for multi-label +to single encoding index.

  • +
+
+
Returns
+

None

+
+
+
+
+
+set_labels(labels)
+

Sets the labels for the data labeler.

+
+
Parameters
+

labels (list or dict) – new labels in either encoding list or dict

+
+
Returns
+

None

+
+
+
+
+
+predict(data, batch_size=32, predict_options=None, error_on_mismatch=False, verbose=1)
+

Predicts labels of the input data with the data labeler model.

+
+
Parameters
+
    +
  • data – data to be predicted upon

  • +
  • batch_size – batch size of prediction

  • +
  • predict_options – optional parameters to allow for predict as a +dict, i.e. dict(show_confidences=True)

  • +
  • error_on_mismatch – if true, errors instead of warns on parameter +mismatches in pipeline

  • +
  • verbose – Flag to determine whether to print status or not

  • +
+
+
Returns
+

predictions

+
+
+
+
+
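A minimal prediction sketch (the labeler name, sample text, and output structure are assumptions for illustration):

from dataprofiler.labelers.base_data_labeler import BaseDataLabeler

labeler = BaseDataLabeler.load_from_library("unstructured_model")  # name assumed
output = labeler.predict(["John's email is john.doe@example.com"],
                         batch_size=32,
                         predict_options=dict(show_confidences=True))
print(output)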
+set_preprocessor(data_processor)
+

Set the data preprocessor for the data labeler

+
+
Parameters
+

data_processor (data_processing.BaseDataPreprocessor) – processor to set as the preprocessor

+
+
Returns
+

None

+
+
+
+
+
+set_model(model)
+

Set the model for the data labeler

+
+
Parameters
+

model (base_model.BaseModel) – model to use within the data labeler

+
+
Returns
+

None

+
+
+
+
+
+set_postprocessor(data_processor)
+

Set the data postprocessor for the data labeler

+
+
Parameters
+

data_processor (data_processing.BaseDataPostprocessor) – processor to set as the postprocessor

+
+
Returns
+

None

+
+
+
+
+
+check_pipeline(skip_postprocessor=False, error_on_mismatch=False)
+

Checks whether the processors and models connect together without error.

+
+
Parameters
+
    +
  • skip_postprocessor (bool) – skip checking postprocessor is valid in +pipeline

  • +
  • error_on_mismatch (bool) – if true, errors instead of warns on parameter +mismatches in pipeline

  • +
+
+
Returns
+

bool indicating valid pipeline

+
+
+
+
+
+classmethod load_from_library(name)
+

Loads the data labeler from the data labeler zoo in the library.

+
+
Parameters
+

name (str) – name of the data labeler.

+
+
Returns
+

DataLabeler class

+
+
+
+
+
+classmethod load_from_disk(dirpath, load_options=None)
+

Loads the data labeler from a saved location on disk.

+
+
Parameters
+
    +
  • dirpath (str) – path to data labeler files.

  • +
  • load_options (dict) – optional arguments to include for load i.e. class +for model or processors

  • +
+
+
Returns
+

DataLabeler class

+
+
+
+
+
+classmethod load_with_components(preprocessor, model, postprocessor)
+

Loads the data labeler from its set of components.

+
+
Parameters
+
+
+
Returns
+

+
+
+
+
+
+save_to_disk(dirpath)
+

Saves the data labeler to the specified location

+
+
Parameters
+

dirpath (str) – location to save the data labeler.

+
+
Returns
+

None

+
+
+
+
+
+
+class dataprofiler.labelers.base_data_labeler.TrainableDataLabeler(dirpath=None, load_options=None)
+

Bases: dataprofiler.labelers.base_data_labeler.BaseDataLabeler

+

Initialize DataLabeler class.

+
+
Parameters
+
    +
  • dirpath – path to data labeler

  • +
  • load_options – optional arguments to include for load i.e. class +for model or processors

  • +
+
+
+
+
+fit(x, y, validation_split=0.2, labels=None, reset_weights=False, batch_size=32, epochs=1, error_on_mismatch=False)
+

Fits the data labeler model for the dataset.

+
+
Parameters
+
    +
  • x (Union[pd.DataFrame, pd.Series, np.ndarray]) – samples to fit model

  • +
  • y (Union[pd.DataFrame, pd.Series, np.ndarray]) – labels associated with the samples to fit model

  • +
  • validation_split (float) – split of the data to have as cross-validation +data

  • +
  • labels (Union[list, dict]) – Encoding or number of labels if refit is needed to new +labels

  • +
  • reset_weights (bool) – Flag to determine whether or not to reset the +weights

  • +
  • batch_size (int) – Size of each batch sent to data labeler model

  • +
  • epochs (int) – number of epochs to iterate over the dataset and send to +the model

  • +
  • error_on_mismatch (bool) – if true, errors instead of warns on parameter +mismatches in pipeline

  • +
+
+
Returns
+

model output

+
+
+
+
+
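A minimal fitting sketch (the labeler name, the sample values, and the label set are assumptions for illustration, not a verified training recipe):

import pandas as pd
from dataprofiler.labelers.base_data_labeler import TrainableDataLabeler

labeler = TrainableDataLabeler.load_from_library("structured_model")  # name assumed
x = pd.Series(["123-45-6789", "john.doe@example.com"])
y = pd.Series(["SSN", "EMAIL_ADDRESS"])
model_output = labeler.fit(x=x, y=y, validation_split=0.2, epochs=1,
                           labels=["UNKNOWN", "SSN", "EMAIL_ADDRESS"])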
+set_model(model)
+

Set the model for a trainable data labeler. Model must have a train +function to be able to be set.

+
+
Parameters
+

model (base_model.BaseModel) – model to use within the data labeler

+
+
Returns
+

None

+
+
+
+
+
+classmethod load_with_components(preprocessor, model, postprocessor)
+

Loads the data labeler from its set of components.

+
+
Parameters
+
+
+
Returns
+

+
+
+
+
+
+add_label(label, same_as=None)
+

Adds a label to the data labeler.

+
+
Parameters
+
    +
  • label (str) – new label being added to the data labeler

  • +
  • same_as (str) – label to have the same encoding index as for multi-label +to single encoding index.

  • +
+
+
Returns
+

None

+
+
+
+
+
+check_pipeline(skip_postprocessor=False, error_on_mismatch=False)
+

Checks whether the processors and models connect together without error.

+
+
Parameters
+
    +
  • skip_postprocessor (bool) – skip checking postprocessor is valid in +pipeline

  • +
  • error_on_mismatch (bool) – if true, errors instead of warns on parameter +mismatches in pipeline

  • +
+
+
Returns
+

bool indicating valid pipeline

+
+
+
+
+
+help()
+

Help function describing alterable parameters, input data formats +for preprocessors, and output data formats for postprocessors.

+
+
Returns
+

None

+
+
+
+
+
+property label_mapping
+

Retrieves the label encodings

+
+
Returns
+

dictionary for associating labels to indexes

+
+
+
+
+
+property labels
+

Retrieves the label

+
+
Returns
+

list of labels

+
+
+
+
+
+classmethod load_from_disk(dirpath, load_options=None)
+

Loads the data labeler from a saved location on disk.

+
+
Parameters
+
    +
  • dirpath (str) – path to data labeler files.

  • +
  • load_options (dict) – optional arguments to include for load i.e. class +for model or processors

  • +
+
+
Returns
+

DataLabeler class

+
+
+
+
+
+classmethod load_from_library(name)
+

Loads the data labeler from the data labeler zoo in the library.

+
+
Parameters
+

name (str) – name of the data labeler.

+
+
Returns
+

DataLabeler class

+
+
+
+
+
+property model
+

Retrieves the data labeler model

+
+
Returns
+

returns the model instance

+
+
+
+
+
+property postprocessor
+

Retrieves the data postprocessor

+
+
Returns
+

returns the postprocessor instance

+
+
+
+
+
+predict(data, batch_size=32, predict_options=None, error_on_mismatch=False, verbose=1)
+

Predicts labels of the input data with the data labeler model.

+
+
Parameters
+
    +
  • data – data to be predicted upon

  • +
  • batch_size – batch size of prediction

  • +
  • predict_options – optional parameters to allow for predict as a +dict, i.e. dict(show_confidences=True)

  • +
  • error_on_mismatch – if true, errors instead of warns on parameter +mismatches in pipeline

  • +
  • verbose – Flag to determine whether to print status or not

  • +
+
+
Returns
+

predictions

+
+
+
+
+
+property preprocessor
+

Retrieves the data preprocessor

+
+
Returns
+

returns the preprocessor instance

+
+
+
+
+
+property reverse_label_mapping
+

Retrieves the index to label encoding

+
+
Returns
+

dictionary for associating indexes to labels

+
+
+
+
+
+save_to_disk(dirpath)
+

Saves the data labeler to the specified location

+
+
Parameters
+

dirpath (str) – location to save the data labeler.

+
+
Returns
+

None

+
+
+
+
+
+set_labels(labels)
+

Sets the labels for the data labeler.

+
+
Parameters
+

labels (list or dict) – new labels in either encoding list or dict

+
+
Returns
+

None

+
+
+
+
+
+set_params(params)
+

Allows user to set parameters of pipeline components in the following +format:

+
+
+
params = dict(

preprocessor=dict(…), +model=dict(…), +postprocessor=dict(…)

+
+
+

)

+
+

where the key,values pairs for each pipeline component must match +parameters that exist in their components.

+
+
Parameters
+

params (dict) –

dictionary containing a key for a given pipeline +component and its associated value of parameters as such:

+
+

dict(preprocessor=dict(…), model=dict(…), +postprocessor=dict(…))

+
+

+
+
Returns
+

None

+
+
+
+
+
+set_postprocessor(data_processor)
+

Set the data postprocessor for the data labeler

+
+
Parameters
+

data_processor (data_processing.BaseDataPostprocessor) – processor to set as the postprocessor

+
+
Returns
+

None

+
+
+
+
+
+set_preprocessor(data_processor)
+

Set the data preprocessor for the data labeler

+
+
Parameters
+

data_processor (data_processing.BaseDataPreprocessor) – processor to set as the preprocessor

+
+
Returns
+

None

+
+
+
+
+
+ +
+ +
+ +
+
+ + + + + + + \ No newline at end of file diff --git a/docs/0.7.1/html/dataprofiler.labelers.base_model.html b/docs/0.7.1/html/dataprofiler.labelers.base_model.html new file mode 100644 index 000000000..7a1c3ed25 --- /dev/null +++ b/docs/0.7.1/html/dataprofiler.labelers.base_model.html @@ -0,0 +1,609 @@ + + + + + + + + + Base Model - Data Profiler v0.7.1 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Base Model

+
+
+class dataprofiler.labelers.base_model.AutoSubRegistrationMeta(clsname, bases, attrs)
+

Bases: abc.ABCMeta

+
+
+mro()
+

Return a type’s method resolution order.

+
+
+
+register(subclass)
+

Register a virtual subclass of an ABC.

+

Returns the subclass, to allow usage as a class decorator.

+
+
+
+
+class dataprofiler.labelers.base_model.BaseModel(label_mapping, parameters)
+

Bases: object

+

Base Model Initializer. Only model and model parameters are stored here +:param parameters: Contains all the appropriate parameters for the model.

+
+

Must contain num_labels.

+
+
+
Returns
+

None

+
+
+
+
+requires_zero_mapping = False
+
+
+
+property label_mapping
+

mapping of labels to their encoded values

+
+
Type
+

return

+
+
+
+
+
+property reverse_label_mapping
+

Reversed order of the current labels, useful when labels need to be +extracted via their indices

+
+
Type
+

return

+
+
+
+
+
+property labels
+

Retrieves the label +:return: list of labels

+
+
+
+property num_labels
+
+
+
+classmethod get_class(class_name)
+
+
+
+get_parameters(param_list=None)
+

Returns a dict of parameters from the model given a list. +:param param_list: list of parameters to retrieve from the model. +:type param_list: list +:return: dict of parameters

+
+
+
+set_params(**kwargs)
+

Given kwargs, set the parameters if they exist.

+
+
+
+add_label(label, same_as=None)
+

Adds a label to the data labeler.

+
+
Parameters
+
    +
  • label (str) – new label being added to the data labeler

  • +
  • same_as (str) – label to have the same encoding index as for multi-label +to single encoding index.

  • +
+
+
Returns
+

None

+
+
+
+
+
+set_label_mapping(label_mapping)
+

Sets the labels for the model

+
+
Parameters
+

label_mapping (Union[list, dict]) – label mapping of the model or list of labels to be +converted into the label mapping

+
+
Returns
+

None

+
+
+
+
+
+classmethod help()
+

Help function describing alterable parameters.

+
+
Returns
+

None

+
+
+
+
+
+abstract reset_weights()
+

Reset the weights of the model.

+
+
Returns
+

None

+
+
+
+
+
+abstract predict(data, batch_size, show_confidences, verbose)
+

Predict the data with the current model +:param data: model input data to predict on +:type data: iterator of data to process +:param batch_size: number of samples in the batch of data +:type batch_size: int +:param show_confidences: whether user wants prediction confidences +:type show_confidences: bool +:param verbose: Flag to determine whether to print status or not +:type verbose: bool +:return: char level predictions and confidences +:rtype: dict

+
+
+
+abstract classmethod load_from_disk(dirpath)
+

Loads whole model from disk with weights +:param dirpath: directory path where you want to load the model from +:type dirpath: str +:return: None

+
+
+
+abstract save_to_disk(dirpath)
+

Saves whole model to disk with weights +:param dirpath: directory path where you want to save the model to +:type dirpath: str +:return: None

+
+
+
+
+class dataprofiler.labelers.base_model.BaseTrainableModel(label_mapping, parameters)
+

Bases: dataprofiler.labelers.base_model.BaseModel

+

Base Model Initializer. Only model and model parameters are stored here +:param parameters: Contains all the appropriate parameters for the model.

+
+

Must contain num_labels.

+
+
+
Returns
+

None

+
+
+
+
+abstract fit(train_data, val_data, batch_size=32, epochs=1, label_mapping=None, reset_weights=False)
+

Train the current model with the training data and validation data +:param train_data: Training data used to train model +:type train_data: Union[pd.DataFrame, pd.Series, np.ndarray] +:param val_data: Validation data used to validate the training +:type val_data: Union[pd.DataFrame, pd.Series, np.ndarray] +:param batch_size: Used to determine number of samples in each batch +:type batch_size: int +:param epochs: Used to determine how many epochs to run +:type epochs: int +:param label_mapping: Mapping of the labels +:type label_mapping: dict +:param reset_weights: Flag to determine whether or not to reset the

+
+

model’s weights

+
+
+
Returns
+

None

+
+
+
+
+
+add_label(label, same_as=None)
+

Adds a label to the data labeler.

+
+
Parameters
+
    +
  • label (str) – new label being added to the data labeler

  • +
  • same_as (str) – label to have the same encoding index as for multi-label +to single encoding index.

  • +
+
+
Returns
+

None

+
+
+
+
+
+classmethod get_class(class_name)
+
+
+
+get_parameters(param_list=None)
+

Returns a dict of parameters from the model given a list. +:param param_list: list of parameters to retrieve from the model. +:type param_list: list +:return: dict of parameters

+
+
+
+classmethod help()
+

Help function describing alterable parameters.

+
+
Returns
+

None

+
+
+
+
+
+property label_mapping
+

mapping of labels to their encoded values

+
+
Type
+

return

+
+
+
+
+
+property labels
+

Retrieves the label +:return: list of labels

+
+
+
+abstract classmethod load_from_disk(dirpath)
+

Loads whole model from disk with weights +:param dirpath: directory path where you want to load the model from +:type dirpath: str +:return: None

+
+
+
+property num_labels
+
+
+
+abstract predict(data, batch_size, show_confidences, verbose)
+

Predict the data with the current model +:param data: model input data to predict on +:type data: iterator of data to process +:param batch_size: number of samples in the batch of data +:type batch_size: int +:param show_confidences: whether user wants prediction confidences +:type show_confidences: bool +:param verbose: Flag to determine whether to print status or not +:type verbose: bool +:return: char level predictions and confidences +:rtype: dict

+
+
+
+requires_zero_mapping = False
+
+
+
+abstract reset_weights()
+

Reset the weights of the model.

+
+
Returns
+

None

+
+
+
+
+
+property reverse_label_mapping
+

Reversed order of the current labels, useful when labels need to be +extracted via their indices

+
+
Type
+

return

+
+
+
+
+
+abstract save_to_disk(dirpath)
+

Saves whole model to disk with weights +:param dirpath: directory path where you want to save the model to +:type dirpath: str +:return: None

+
+
+
+set_label_mapping(label_mapping)
+

Sets the labels for the model

+
+
Parameters
+

label_mapping (Union[list, dict]) – label mapping of the model or list of labels to be +converted into the label mapping

+
+
Returns
+

None

+
+
+
+
+
+set_params(**kwargs)
+

Given kwargs, set the parameters if they exist.

+
+
+
+ +
+ +
+ +
+
+ + + + + + + \ No newline at end of file diff --git a/docs/0.7.1/html/dataprofiler.labelers.character_level_cnn_model.html b/docs/0.7.1/html/dataprofiler.labelers.character_level_cnn_model.html new file mode 100644 index 000000000..d5e3bf1ab --- /dev/null +++ b/docs/0.7.1/html/dataprofiler.labelers.character_level_cnn_model.html @@ -0,0 +1,2637 @@ + + + + + + + + + Character Level Cnn Model - Data Profiler v0.7.1 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Character Level Cnn Model

+
+
+class dataprofiler.labelers.character_level_cnn_model.NoV1ResourceMessageFilter(name='')
+

Bases: logging.Filter

+

Removes TF2 warning for using TF1 model which has resources.

+

Initialize a filter.

+

Initialize with the name of the logger which, together with its +children, will have its events allowed through the filter. If no +name is specified, allow every event.

+
+
+filter(record)
+

Determine if the specified record is to be logged.

+

Returns True if the record should be logged, or False otherwise. +If deemed appropriate, the record may be modified in-place.

+
+
+
+
+class dataprofiler.labelers.character_level_cnn_model.FBetaScore(*args, **kwargs)
+

Bases: tensorflow.python.keras.metrics.Metric

+

Computes F-Beta score. +Adapted and slightly modified from https://github.com/tensorflow/addons/blob/v0.12.0/tensorflow_addons/metrics/f_scores.py#L211-L283

+

# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# https://github.com/tensorflow/addons/blob/v0.12.0/LICENSE +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ==============================================================================

+

It is the weighted harmonic mean of precision +and recall. Output range is [0, 1]. Works for +both multi-class and multi-label classification. +$$ +F_{\beta} = (1 + \beta^2) \cdot \frac{\textrm{precision} \cdot \textrm{recall}}{(\beta^2 \cdot \textrm{precision}) + \textrm{recall}} +$$ +:param num_classes: Number of unique classes in the dataset. +:param average: Type of averaging to be performed on data.

+
+

Acceptable values are None, micro, macro and +weighted. Default value is None.

+
+
+
Parameters
+
    +
  • beta – Determines the weight of precision and recall +in harmonic mean. Determines the weight given to the +precision and recall. Default value is 1.

  • +
  • threshold – Elements of y_pred greater than threshold are +converted to be 1, and the rest 0. If threshold is +None, the argmax is converted to 1, and the rest 0.

  • +
  • name – (Optional) String name of the metric instance.

  • +
  • dtype – (Optional) Data type of the metric result.

  • +
+
+
Returns
+

float.

+
+
Return type
+

F-Beta Score

+
+
+
+
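A minimal sketch of using the metric when compiling a Keras model (the constructor arguments follow the parameter list above and the values are assumptions):

import tensorflow as tf
from dataprofiler.labelers.character_level_cnn_model import FBetaScore

fbeta = FBetaScore(num_classes=3, average="micro", beta=1.0)
model = tf.keras.Sequential([tf.keras.layers.Dense(3, activation="softmax")])
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=[fbeta])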
+update_state(y_true, y_pred, sample_weight=None)
+

Accumulates statistics for the metric.

+

Note: This function is executed as a graph function in graph mode. +This means:

+
+
    +
  1. Operations on the same resource are executed in textual order. +This should make it easier to do things like add the updated +value of a variable to another, for example.

  2. +
  3. You don’t need to worry about collecting the update ops to execute. +All update ops added to the graph by this function will be executed.

  4. +
+

As a result, code should generally work the same way with graph or +eager execution.

+
+
+
Parameters
+
    +
  • *args

  • +
  • **kwargs – A mini-batch of inputs to the Metric.

  • +
+
+
+
+
+
+result()
+

Computes and returns the metric value tensor.

+

Result computation is an idempotent operation that simply calculates the +metric value using the state variables.

+
+
+
+get_config()
+

Returns the serializable config of the metric.

+
+
+
+reset_states()
+
+
+
+property activity_regularizer
+

Optional regularizer function for the output of this layer.

+
+
+
+add_loss(losses, **kwargs)
+

Add loss tensor(s), potentially dependent on layer inputs.

+

Some losses (for instance, activity regularization losses) may be dependent +on the inputs passed when calling a layer. Hence, when reusing the same +layer on different inputs a and b, some entries in layer.losses may +be dependent on a and some on b. This method automatically keeps track +of dependencies.

+

This method can be used inside a subclassed layer or model’s call +function, in which case losses should be a Tensor or list of Tensors.

+

Example:

+

```python +class MyLayer(tf.keras.layers.Layer):

+
+
+
def call(self, inputs):

self.add_loss(tf.abs(tf.reduce_mean(inputs))) +return inputs

+
+
+
+

```

+

This method can also be called directly on a Functional Model during +construction. In this case, any loss Tensors passed to this Model must +be symbolic and be able to be traced back to the model’s Input`s. These +losses become part of the model’s topology and are tracked in `get_config.

+

Example:

+

`python +inputs = tf.keras.Input(shape=(10,)) +x = tf.keras.layers.Dense(10)(inputs) +outputs = tf.keras.layers.Dense(1)(x) +model = tf.keras.Model(inputs, outputs) +# Activity regularization. +model.add_loss(tf.abs(tf.reduce_mean(x))) +`

+

If this is not the case for your loss (if, for example, your loss references +a Variable of one of the model’s layers), you can wrap your loss in a +zero-argument lambda. These losses are not tracked as part of the model’s +topology since they can’t be serialized.

+

Example:

+

`python +inputs = tf.keras.Input(shape=(10,)) +d = tf.keras.layers.Dense(10) +x = d(inputs) +outputs = tf.keras.layers.Dense(1)(x) +model = tf.keras.Model(inputs, outputs) +# Weight regularization. +model.add_loss(lambda: tf.reduce_mean(d.kernel)) +`

+
+
Parameters
+
    +
  • losses – Loss tensor, or list/tuple of tensors. Rather than tensors, losses +may also be zero-argument callables which create a loss tensor.

  • +
  • **kwargs

    Additional keyword arguments for backward compatibility. +Accepted values:

    +
    +

    inputs - Deprecated, will be automatically inferred.

    +
    +

  • +
+
+
+
+
+
+add_metric(value, name=None, **kwargs)
+

Adds metric tensor to the layer.

+

This method can be used inside the call() method of a subclassed layer +or model.

+

```python +class MyMetricLayer(tf.keras.layers.Layer):

+
+
+
def __init__(self):

super(MyMetricLayer, self).__init__(name=’my_metric_layer’) +self.mean = tf.keras.metrics.Mean(name=’metric_1’)

+
+
def call(self, inputs):

self.add_metric(self.mean(inputs)) +self.add_metric(tf.reduce_sum(inputs), name=’metric_2’) +return inputs

+
+
+
+

```

+

This method can also be called directly on a Functional Model during +construction. In this case, any tensor passed to this Model must +be symbolic and be able to be traced back to the model’s Input`s. These +metrics become part of the model’s topology and are tracked when you +save the model via `save().

+

`python +inputs = tf.keras.Input(shape=(10,)) +x = tf.keras.layers.Dense(10)(inputs) +outputs = tf.keras.layers.Dense(1)(x) +model = tf.keras.Model(inputs, outputs) +model.add_metric(math_ops.reduce_sum(x), name='metric_1') +`

+

Note: Calling add_metric() with the result of a metric object on a +Functional Model, as shown in the example below, is not supported. This is +because we cannot trace the metric result tensor back to the model’s inputs.

+

`python +inputs = tf.keras.Input(shape=(10,)) +x = tf.keras.layers.Dense(10)(inputs) +outputs = tf.keras.layers.Dense(1)(x) +model = tf.keras.Model(inputs, outputs) +model.add_metric(tf.keras.metrics.Mean()(x), name='metric_1') +`

+
+
Parameters
+
    +
  • value – Metric tensor.

  • +
  • name – String metric name.

  • +
  • **kwargs – Additional keyword arguments for backward compatibility. +Accepted values: +aggregation - When the value tensor provided is not the result of +calling a keras.Metric instance, it will be aggregated by default +using a keras.Metric.Mean.

  • +
+
+
+
+
+
+add_update(updates, inputs=None)
+

Add update op(s), potentially dependent on layer inputs.

+

Weight updates (for instance, the updates of the moving mean and variance +in a BatchNormalization layer) may be dependent on the inputs passed +when calling a layer. Hence, when reusing the same layer on +different inputs a and b, some entries in layer.updates may be +dependent on a and some on b. This method automatically keeps track +of dependencies.

+

This call is ignored when eager execution is enabled (in that case, variable +updates are run on the fly and thus do not need to be tracked for later +execution).

+
+
Parameters
+
    +
  • updates – Update op, or list/tuple of update ops, or zero-arg callable +that returns an update op. A zero-arg callable should be passed in +order to disable running the updates by setting trainable=False +on this Layer, when executing in Eager mode.

  • +
  • inputs – Deprecated, will be automatically inferred.

  • +
+
+
+
+
+
+add_variable(*args, **kwargs)
+

Deprecated, do NOT use! Alias for add_weight.

+
+
+
+add_weight(name, shape=(), aggregation=VariableAggregation.SUM, synchronization=VariableSynchronization.ON_READ, initializer=None, dtype=None)
+

Adds state variable. Only for use by subclasses.

+
+
+
+apply(inputs, *args, **kwargs)
+

Deprecated, do NOT use!

+

This is an alias of self.__call__.

+
+
Parameters
+
    +
  • inputs – Input tensor(s).

  • +
  • *args – additional positional arguments to be passed to self.call.

  • +
  • **kwargs – additional keyword arguments to be passed to self.call.

  • +
+
+
Returns
+

Output tensor(s).

+
+
+
+
+
+build(input_shape)
+

Creates the variables of the layer (optional, for subclass implementers).

+

This is a method that implementers of subclasses of Layer or Model +can override if they need a state-creation step in-between +layer instantiation and layer call.

+

This is typically used to create the weights of Layer subclasses.

+
+
Parameters
+

input_shape – Instance of TensorShape, or list of instances of +TensorShape if the layer expects a list of inputs +(one instance per input).

+
+
+
+
+
+call(inputs, *args, **kwargs)
+

This is where the layer’s logic lives.

+

Note here that call() method in tf.keras is little bit different +from keras API. In keras API, you can pass support masking for +layers as additional arguments. Whereas tf.keras has compute_mask() +method to support masking.

+
+
Parameters
+
    +
  • inputs – Input tensor, or list/tuple of input tensors.

  • +
  • *args – Additional positional arguments. Currently unused.

  • +
  • **kwargs – Additional keyword arguments. Currently unused.

  • +
+
+
Returns
+

A tensor or list/tuple of tensors.

+
+
+
+
+
+property compute_dtype
+

The dtype of the layer’s computations.

+

This is equivalent to Layer.dtype_policy.compute_dtype. Unless +mixed precision is used, this is the same as Layer.dtype, the dtype of +the weights.

+

Layers automatically cast their inputs to the compute dtype, which causes +computations and the output to be in the compute dtype as well. This is done +by the base Layer class in Layer.__call__, so you do not have to insert +these casts if implementing your own layer.

+

Layers often perform certain internal computations in higher precision when +compute_dtype is float16 or bfloat16 for numeric stability. The output +will still typically be float16 or bfloat16 in such cases.

+
+
Returns
+

The layer’s compute dtype.

+
+
+
+
+
+compute_mask(inputs, mask=None)
+

Computes an output mask tensor.

+
+
Parameters
+
    +
  • inputs – Tensor or list of tensors.

  • +
  • mask – Tensor or list of tensors.

  • +
+
+
Returns
+

+
None or a tensor (or list of tensors,

one per output tensor of the layer).

+
+
+

+
+
+
+
+
+compute_output_shape(input_shape)
+

Computes the output shape of the layer.

+

If the layer has not been built, this method will call build on the +layer. This assumes that the layer will later be used with inputs that +match the input shape provided here.

+
+
Parameters
+

input_shape – Shape tuple (tuple of integers) +or list of shape tuples (one per output tensor of the layer). +Shape tuples can include None for free dimensions, +instead of an integer.

+
+
Returns
+

An input shape tuple.

+
+
+
+
+
+compute_output_signature(input_signature)
+

Compute the output tensor signature of the layer based on the inputs.

+

Unlike a TensorShape object, a TensorSpec object contains both shape +and dtype information for a tensor. This method allows layers to provide +output dtype information if it is different from the input dtype. +For any layer that doesn’t implement this function, +the framework will fall back to use compute_output_shape, and will +assume that the output dtype matches the input dtype.

+
+
Parameters
+

input_signature – Single TensorSpec or nested structure of TensorSpec +objects, describing a candidate input for the layer.

+
+
Returns
+

+
Single TensorSpec or nested structure of TensorSpec objects, describing

how the layer would transform the provided input.

+
+
+

+
+
Raises
+

TypeError – If input_signature contains a non-TensorSpec object.

+
+
+
+
+
+count_params()
+

Count the total number of scalars composing the weights.

+
+
Returns
+

An integer count.

+
+
Raises
+

ValueError – if the layer isn’t yet built + (in which case its weights aren’t yet defined).

+
+
+
+
+
+property dtype
+

The dtype of the layer weights.

+

This is equivalent to Layer.dtype_policy.variable_dtype. Unless +mixed precision is used, this is the same as Layer.compute_dtype, the +dtype of the layer’s computations.

+
+
+
+property dtype_policy
+

The dtype policy associated with this layer.

+

This is an instance of a tf.keras.mixed_precision.Policy.

+
+
+
+property dynamic
+

Whether the layer is dynamic (eager-only); set in the constructor.

+
+
+
+classmethod from_config(config)
+

Creates a layer from its config.

+

This method is the reverse of get_config, +capable of instantiating the same layer from the config +dictionary. It does not handle layer connectivity +(handled by Network), nor weights (handled by set_weights).

+
+
Parameters
+

config – A Python dictionary, typically the +output of get_config.

+
+
Returns
+

A layer instance.

+
+
+
+
+
+get_input_at(node_index)
+

Retrieves the input tensor(s) of a layer at a given node.

+
+
Parameters
+

node_index – Integer, index of the node +from which to retrieve the attribute. +E.g. node_index=0 will correspond to the +first input node of the layer.

+
+
Returns
+

A tensor (or list of tensors if the layer has multiple inputs).

+
+
Raises
+

RuntimeError – If called in Eager mode.

+
+
+
+
+
+get_input_mask_at(node_index)
+

Retrieves the input mask tensor(s) of a layer at a given node.

+
+
Parameters
+

node_index – Integer, index of the node +from which to retrieve the attribute. +E.g. node_index=0 will correspond to the +first time the layer was called.

+
+
Returns
+

A mask tensor +(or list of tensors if the layer has multiple inputs).

+
+
+
+
+
+get_input_shape_at(node_index)
+

Retrieves the input shape(s) of a layer at a given node.

+
+
Parameters
+

node_index – Integer, index of the node +from which to retrieve the attribute. +E.g. node_index=0 will correspond to the +first time the layer was called.

+
+
Returns
+

A shape tuple +(or list of shape tuples if the layer has multiple inputs).

+
+
Raises
+

RuntimeError – If called in Eager mode.

+
+
+
+
+
+get_losses_for(inputs)
+

Deprecated, do NOT use!

+

Retrieves losses relevant to a specific set of inputs.

+
+
Parameters
+

inputs – Input tensor or list/tuple of input tensors.

+
+
Returns
+

List of loss tensors of the layer that depend on inputs.

+
+
+
+
+
+get_output_at(node_index)
+

Retrieves the output tensor(s) of a layer at a given node.

+
+
Parameters
+

node_index – Integer, index of the node +from which to retrieve the attribute. +E.g. node_index=0 will correspond to the +first output node of the layer.

+
+
Returns
+

A tensor (or list of tensors if the layer has multiple outputs).

+
+
Raises
+

RuntimeError – If called in Eager mode.

+
+
+
+
+
+get_output_mask_at(node_index)
+

Retrieves the output mask tensor(s) of a layer at a given node.

+
+
Parameters
+

node_index – Integer, index of the node +from which to retrieve the attribute. +E.g. node_index=0 will correspond to the +first time the layer was called.

+
+
Returns
+

A mask tensor +(or list of tensors if the layer has multiple outputs).

+
+
+
+
+
+get_output_shape_at(node_index)
+

Retrieves the output shape(s) of a layer at a given node.

+
+
Parameters
+

node_index – Integer, index of the node +from which to retrieve the attribute. +E.g. node_index=0 will correspond to the +first time the layer was called.

+
+
Returns
+

A shape tuple +(or list of shape tuples if the layer has multiple outputs).

+
+
Raises
+

RuntimeError – If called in Eager mode.

+
+
+
+
+
+get_updates_for(inputs)
+

Deprecated, do NOT use!

+

Retrieves updates relevant to a specific set of inputs.

+
+
Parameters
+

inputs – Input tensor or list/tuple of input tensors.

+
+
Returns
+

List of update ops of the layer that depend on inputs.

+
+
+
+
+
+get_weights()
+

Returns the current weights of the layer, as NumPy arrays.

+

The weights of a layer represent the state of the layer. This function +returns both trainable and non-trainable weight values associated with this +layer as a list of NumPy arrays, which can in turn be used to load state +into similarly parameterized layers.

+

For example, a Dense layer returns a list of two values: the kernel matrix +and the bias vector. These can be used to set the weights of another +Dense layer:

+
>>> layer_a = tf.keras.layers.Dense(1,
+...   kernel_initializer=tf.constant_initializer(1.))
+>>> a_out = layer_a(tf.convert_to_tensor([[1., 2., 3.]]))
+>>> layer_a.get_weights()
+[array([[1.],
+       [1.],
+       [1.]], dtype=float32), array([0.], dtype=float32)]
+>>> layer_b = tf.keras.layers.Dense(1,
+...   kernel_initializer=tf.constant_initializer(2.))
+>>> b_out = layer_b(tf.convert_to_tensor([[10., 20., 30.]]))
+>>> layer_b.get_weights()
+[array([[2.],
+       [2.],
+       [2.]], dtype=float32), array([0.], dtype=float32)]
+>>> layer_b.set_weights(layer_a.get_weights())
+>>> layer_b.get_weights()
+[array([[1.],
+       [1.],
+       [1.]], dtype=float32), array([0.], dtype=float32)]
+
+
+
+
Returns
+

Weights values as a list of NumPy arrays.

+
+
+
+
+
+property inbound_nodes
+

Deprecated, do NOT use! Only for compatibility with external Keras.

+
+
+
+property input
+

Retrieves the input tensor(s) of a layer.

+

Only applicable if the layer has exactly one input, +i.e. if it is connected to one incoming layer.

+
+
Returns
+

Input tensor or list of input tensors.

+
+
Raises
+
    +
  • RuntimeError – If called in Eager mode.

  • +
  • AttributeError – If no inbound nodes are found.

  • +
+
+
+
+
+
+property input_mask
+

Retrieves the input mask tensor(s) of a layer.

+

Only applicable if the layer has exactly one inbound node, +i.e. if it is connected to one incoming layer.

+
+
Returns
+

Input mask tensor (potentially None) or list of input +mask tensors.

+
+
Raises
+
    +
  • AttributeError – if the layer is connected to

  • +
  • more than one incoming layers.

  • +
+
+
+
+
+
+property input_shape
+

Retrieves the input shape(s) of a layer.

+

Only applicable if the layer has exactly one input, +i.e. if it is connected to one incoming layer, or if all inputs +have the same shape.

+
+
Returns
+

Input shape, as an integer shape tuple +(or list of shape tuples, one tuple per input tensor).

+
+
Raises
+
    +
  • AttributeError – if the layer has no defined input_shape.

  • +
  • RuntimeError – if called in Eager mode.

  • +
+
+
+
+
+
+property input_spec
+

InputSpec instance(s) describing the input format for this layer.

+

When you create a layer subclass, you can set self.input_spec to enable +the layer to run input compatibility checks when it is called. +Consider a Conv2D layer: it can only be called on a single input tensor +of rank 4. As such, you can set, in __init__():

+

`python +self.input_spec = tf.keras.layers.InputSpec(ndim=4) +`

+

Now, if you try to call the layer on an input that isn’t rank 4 (for instance, an input of shape (2,)), it will raise a nicely-formatted error:

+

` +ValueError: Input 0 of layer conv2d is incompatible with the layer: +expected ndim=4, found ndim=1. Full shape received: [2] +`

+

Input checks that can be specified via input_spec include: +- Structure (e.g. a single input, a list of 2 inputs, etc) +- Shape +- Rank (ndim) +- Dtype

+

For more information, see tf.keras.layers.InputSpec.

+
+
Returns
+

A tf.keras.layers.InputSpec instance, or nested structure thereof.

+
+
+
+
+
+property losses
+

List of losses added using the add_loss() API.

+

Variable regularization tensors are created when this property is accessed, +so it is eager safe: accessing losses under a tf.GradientTape will +propagate gradients back to the corresponding variables.

+

Examples:

+
>>> class MyLayer(tf.keras.layers.Layer):
+...   def call(self, inputs):
+...     self.add_loss(tf.abs(tf.reduce_mean(inputs)))
+...     return inputs
+>>> l = MyLayer()
+>>> l(np.ones((10, 1)))
+>>> l.losses
+[1.0]
+
+
+
>>> inputs = tf.keras.Input(shape=(10,))
+>>> x = tf.keras.layers.Dense(10)(inputs)
+>>> outputs = tf.keras.layers.Dense(1)(x)
+>>> model = tf.keras.Model(inputs, outputs)
+>>> # Activity regularization.
+>>> len(model.losses)
+0
+>>> model.add_loss(tf.abs(tf.reduce_mean(x)))
+>>> len(model.losses)
+1
+
+
+
>>> inputs = tf.keras.Input(shape=(10,))
+>>> d = tf.keras.layers.Dense(10, kernel_initializer='ones')
+>>> x = d(inputs)
+>>> outputs = tf.keras.layers.Dense(1)(x)
+>>> model = tf.keras.Model(inputs, outputs)
+>>> # Weight regularization.
+>>> model.add_loss(lambda: tf.reduce_mean(d.kernel))
+>>> model.losses
+[<tf.Tensor: shape=(), dtype=float32, numpy=1.0>]
+
+
+
+
Returns
+

A list of tensors.

+
+
+
+
+
+property metrics
+

List of metrics added using the add_metric() API.

+

Example:

+
>>> input = tf.keras.layers.Input(shape=(3,))
+>>> d = tf.keras.layers.Dense(2)
+>>> output = d(input)
+>>> d.add_metric(tf.reduce_max(output), name='max')
+>>> d.add_metric(tf.reduce_min(output), name='min')
+>>> [m.name for m in d.metrics]
+['max', 'min']
+
+
+
+
Returns
+

A list of Metric objects.

+
+
+
+
+
+property name
+

Name of the layer (string), set in the constructor.

+
+
+
+property name_scope
+

Returns a tf.name_scope instance for this class.

+
+
+
+property non_trainable_variables
+

Sequence of non-trainable variables owned by this module and its submodules.

+

Note: this method uses reflection to find variables on the current instance +and submodules. For performance reasons you may wish to cache the result +of calling this method if you don’t expect the return value to change.

+
+
Returns
+

A sequence of variables for the current module (sorted by attribute +name) followed by variables from all submodules recursively (breadth +first).

+
+
+
+
+
+property non_trainable_weights
+

List of all non-trainable weights tracked by this layer.

+

Non-trainable weights are not updated during training. They are expected +to be updated manually in call().

+
+
Returns
+

A list of non-trainable variables.

+
+
+
+
+
+property outbound_nodes
+

Deprecated, do NOT use! Only for compatibility with external Keras.

+
+
+
+property output
+

Retrieves the output tensor(s) of a layer.

+

Only applicable if the layer has exactly one output, +i.e. if it is connected to one incoming layer.

+
+
Returns
+

Output tensor or list of output tensors.

+
+
Raises
+
    +
  • AttributeError – if the layer is connected to more than one incoming layer.

  • +
  • RuntimeError – if called in Eager mode.

  • +
+
+
+
+
+
+property output_mask
+

Retrieves the output mask tensor(s) of a layer.

+

Only applicable if the layer has exactly one inbound node, +i.e. if it is connected to one incoming layer.

+
+
Returns
+

Output mask tensor (potentially None) or list of output +mask tensors.

+
+
Raises
+
    +
  • AttributeError – if the layer is connected to more than one incoming layer.

  • +
+
+
+
+
+
+property output_shape
+

Retrieves the output shape(s) of a layer.

+

Only applicable if the layer has one output, +or if all outputs have the same shape.

+
+
Returns
+

Output shape, as an integer shape tuple +(or list of shape tuples, one tuple per output tensor).

+
+
Raises
+
    +
  • AttributeError – if the layer has no defined output shape.

  • +
  • RuntimeError – if called in Eager mode.

  • +
+
+
+
+
+
+reset_state()
+

Resets all of the metric state variables.

+

This function is called between epochs/steps, +when a metric is evaluated during training.

+
+
+
+set_weights(weights)
+

Sets the weights of the layer, from NumPy arrays.

+

The weights of a layer represent the state of the layer. This function +sets the weight values from numpy arrays. The weight values should be +passed in the order they are created by the layer. Note that the layer’s +weights must be instantiated before calling this function, by calling +the layer.

+

For example, a Dense layer returns a list of two values: the kernel matrix +and the bias vector. These can be used to set the weights of another +Dense layer:

+
>>> layer_a = tf.keras.layers.Dense(1,
+...   kernel_initializer=tf.constant_initializer(1.))
+>>> a_out = layer_a(tf.convert_to_tensor([[1., 2., 3.]]))
+>>> layer_a.get_weights()
+[array([[1.],
+       [1.],
+       [1.]], dtype=float32), array([0.], dtype=float32)]
+>>> layer_b = tf.keras.layers.Dense(1,
+...   kernel_initializer=tf.constant_initializer(2.))
+>>> b_out = layer_b(tf.convert_to_tensor([[10., 20., 30.]]))
+>>> layer_b.get_weights()
+[array([[2.],
+       [2.],
+       [2.]], dtype=float32), array([0.], dtype=float32)]
+>>> layer_b.set_weights(layer_a.get_weights())
+>>> layer_b.get_weights()
+[array([[1.],
+       [1.],
+       [1.]], dtype=float32), array([0.], dtype=float32)]
+
+
+
+
Parameters
+

weights – a list of NumPy arrays. The number +of arrays and their shape must match +number of the dimensions of the weights +of the layer (i.e. it should match the +output of get_weights).

+
+
Raises
+

ValueError – If the provided weights list does not match the + layer’s specifications.

+
+
+
+
+
+property stateful
+
+
+
+property submodules
+

Sequence of all sub-modules.

+

Submodules are modules which are properties of this module, or found as +properties of modules which are properties of this module (and so on).

+
>>> a = tf.Module()
+>>> b = tf.Module()
+>>> c = tf.Module()
+>>> a.b = b
+>>> b.c = c
+>>> list(a.submodules) == [b, c]
+True
+>>> list(b.submodules) == [c]
+True
+>>> list(c.submodules) == []
+True
+
+
+
+
Returns
+

A sequence of all submodules.

+
+
+
+
+
+property supports_masking
+

Whether this layer supports computing a mask using compute_mask.

+
+
+
+property trainable
+
+
+
+property trainable_variables
+

Sequence of trainable variables owned by this module and its submodules.

+

Note: this method uses reflection to find variables on the current instance +and submodules. For performance reasons you may wish to cache the result +of calling this method if you don’t expect the return value to change.

+
+
Returns
+

A sequence of variables for the current module (sorted by attribute +name) followed by variables from all submodules recursively (breadth +first).

+
+
+
+
+
+property trainable_weights
+

List of all trainable weights tracked by this layer.

+

Trainable weights are updated via gradient descent during training.

+
+
Returns
+

A list of trainable variables.

+
+
+
+
+
+property updates
+
+
+
+property variable_dtype
+

Alias of Layer.dtype, the dtype of the weights.

+
+
+
+property variables
+

Returns the list of all layer variables/weights.

+

Alias of self.weights.

+

Note: This will not track the weights of nested tf.Modules that are not +themselves Keras layers.

+
+
Returns
+

A list of variables.

+
+
+
+
+
+property weights
+

Returns the list of all layer variables/weights.

+
+
Returns
+

A list of variables.

+
+
+
+
+
+classmethod with_name_scope(method)
+

Decorator to automatically enter the module name scope.

+
>>> class MyModule(tf.Module):
+...   @tf.Module.with_name_scope
+...   def __call__(self, x):
+...     if not hasattr(self, 'w'):
+...       self.w = tf.Variable(tf.random.normal([x.shape[1], 3]))
+...     return tf.matmul(x, self.w)
+
+
+

Using the above module would produce `tf.Variable`s and `tf.Tensor`s whose +names included the module name:

+
>>> mod = MyModule()
+>>> mod(tf.ones([1, 2]))
+<tf.Tensor: shape=(1, 3), dtype=float32, numpy=..., dtype=float32)>
+>>> mod.w
+<tf.Variable 'my_module/Variable:0' shape=(2, 3) dtype=float32,
+numpy=..., dtype=float32)>
+
+
+
+
Parameters
+

method – The method to wrap.

+
+
Returns
+

The original method wrapped such that it enters the module’s name scope.

+
+
+
+
+
+
+class dataprofiler.labelers.character_level_cnn_model.F1Score(*args, **kwargs)
+

Bases: dataprofiler.labelers.character_level_cnn_model.FBetaScore

+

Computes F-1 Score.

+

# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# https://github.com/tensorflow/addons/blob/v0.12.0/LICENSE +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ==============================================================================

+

It is the harmonic mean of precision and recall. Output range is [0, 1]. Works for both multi-class and multi-label classification.

$$
F_1 = 2 \cdot \frac{\textrm{precision} \cdot \textrm{recall}}{\textrm{precision} + \textrm{recall}}
$$

:param num_classes: Number of unique classes in the dataset.
:param average: Type of averaging to be performed on data.

+
+

Acceptable values are None, micro, macro +and weighted. Default value is None.

+
+
+
Parameters
+
    +
  • threshold – Elements of y_pred above threshold are +considered to be 1, and the rest 0. If threshold is +None, the argmax is converted to 1, and the rest 0.

  • +
  • name – (Optional) String name of the metric instance.

  • +
  • dtype – (Optional) Data type of the metric result.

  • +
+
+
Returns
+

F-1 Score

+
+
Return type
+

float

+
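As a rough usage sketch (assuming this class keeps the tensorflow_addons FBetaScore interface it is based on; the argument values below are illustrative):

import tensorflow as tf
from dataprofiler.labelers.character_level_cnn_model import F1Score

# One-hot ground truth vs. predicted probabilities for a toy 3-class problem.
y_true = tf.constant([[1, 0, 0], [0, 1, 0], [0, 0, 1]], dtype=tf.float32)
y_pred = tf.constant([[0.9, 0.1, 0.0], [0.2, 0.7, 0.1], [0.1, 0.2, 0.7]], dtype=tf.float32)

f1 = F1Score(num_classes=3, average='micro')  # average may also be None, 'macro' or 'weighted'
f1.update_state(y_true, y_pred)
print(float(f1.result()))  # harmonic mean of precision and recall; 1.0 for this toy input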
+
+
+
+get_config()
+

Returns the serializable config of the metric.

+
+
+
+property activity_regularizer
+

Optional regularizer function for the output of this layer.

+
+
+
+add_loss(losses, **kwargs)
+

Add loss tensor(s), potentially dependent on layer inputs.

+

Some losses (for instance, activity regularization losses) may be dependent +on the inputs passed when calling a layer. Hence, when reusing the same +layer on different inputs a and b, some entries in layer.losses may +be dependent on a and some on b. This method automatically keeps track +of dependencies.

+

This method can be used inside a subclassed layer or model’s call +function, in which case losses should be a Tensor or list of Tensors.

+

Example:

+

```python
class MyLayer(tf.keras.layers.Layer):

  def call(self, inputs):
    self.add_loss(tf.abs(tf.reduce_mean(inputs)))
    return inputs
```

+

This method can also be called directly on a Functional Model during +construction. In this case, any loss Tensors passed to this Model must +be symbolic and be able to be traced back to the model’s Input`s. These +losses become part of the model’s topology and are tracked in `get_config.

+

Example:

+

`python +inputs = tf.keras.Input(shape=(10,)) +x = tf.keras.layers.Dense(10)(inputs) +outputs = tf.keras.layers.Dense(1)(x) +model = tf.keras.Model(inputs, outputs) +# Activity regularization. +model.add_loss(tf.abs(tf.reduce_mean(x))) +`

+

If this is not the case for your loss (if, for example, your loss references +a Variable of one of the model’s layers), you can wrap your loss in a +zero-argument lambda. These losses are not tracked as part of the model’s +topology since they can’t be serialized.

+

Example:

+

`python +inputs = tf.keras.Input(shape=(10,)) +d = tf.keras.layers.Dense(10) +x = d(inputs) +outputs = tf.keras.layers.Dense(1)(x) +model = tf.keras.Model(inputs, outputs) +# Weight regularization. +model.add_loss(lambda: tf.reduce_mean(d.kernel)) +`

+
+
Parameters
+
    +
  • losses – Loss tensor, or list/tuple of tensors. Rather than tensors, losses +may also be zero-argument callables which create a loss tensor.

  • +
  • **kwargs

    Additional keyword arguments for backward compatibility. +Accepted values:

    +
    +

    inputs - Deprecated, will be automatically inferred.

    +
    +

  • +
+
+
+
+
+
+add_metric(value, name=None, **kwargs)
+

Adds metric tensor to the layer.

+

This method can be used inside the call() method of a subclassed layer +or model.

+

```python
class MyMetricLayer(tf.keras.layers.Layer):

  def __init__(self):
    super(MyMetricLayer, self).__init__(name='my_metric_layer')
    self.mean = tf.keras.metrics.Mean(name='metric_1')

  def call(self, inputs):
    self.add_metric(self.mean(inputs))
    self.add_metric(tf.reduce_sum(inputs), name='metric_2')
    return inputs
```

+

This method can also be called directly on a Functional Model during +construction. In this case, any tensor passed to this Model must +be symbolic and be able to be traced back to the model’s Input`s. These +metrics become part of the model’s topology and are tracked when you +save the model via `save().

+

`python +inputs = tf.keras.Input(shape=(10,)) +x = tf.keras.layers.Dense(10)(inputs) +outputs = tf.keras.layers.Dense(1)(x) +model = tf.keras.Model(inputs, outputs) +model.add_metric(math_ops.reduce_sum(x), name='metric_1') +`

+

Note: Calling add_metric() with the result of a metric object on a +Functional Model, as shown in the example below, is not supported. This is +because we cannot trace the metric result tensor back to the model’s inputs.

+

`python +inputs = tf.keras.Input(shape=(10,)) +x = tf.keras.layers.Dense(10)(inputs) +outputs = tf.keras.layers.Dense(1)(x) +model = tf.keras.Model(inputs, outputs) +model.add_metric(tf.keras.metrics.Mean()(x), name='metric_1') +`

+
+
Parameters
+
    +
  • value – Metric tensor.

  • +
  • name – String metric name.

  • +
  • **kwargs – Additional keyword arguments for backward compatibility. +Accepted values: +aggregation - When the value tensor provided is not the result of +calling a keras.Metric instance, it will be aggregated by default +using a keras.Metric.Mean.

  • +
+
+
+
+
+
+add_update(updates, inputs=None)
+

Add update op(s), potentially dependent on layer inputs.

+

Weight updates (for instance, the updates of the moving mean and variance +in a BatchNormalization layer) may be dependent on the inputs passed +when calling a layer. Hence, when reusing the same layer on +different inputs a and b, some entries in layer.updates may be +dependent on a and some on b. This method automatically keeps track +of dependencies.

+

This call is ignored when eager execution is enabled (in that case, variable +updates are run on the fly and thus do not need to be tracked for later +execution).

+
+
Parameters
+
    +
  • updates – Update op, or list/tuple of update ops, or zero-arg callable +that returns an update op. A zero-arg callable should be passed in +order to disable running the updates by setting trainable=False +on this Layer, when executing in Eager mode.

  • +
  • inputs – Deprecated, will be automatically inferred.

  • +
+
+
+
+
+
+add_variable(*args, **kwargs)
+

Deprecated, do NOT use! Alias for add_weight.

+
+
+
+add_weight(name, shape=(), aggregation=VariableAggregation.SUM, synchronization=VariableSynchronization.ON_READ, initializer=None, dtype=None)
+

Adds state variable. Only for use by subclasses.

+
+
+
+apply(inputs, *args, **kwargs)
+

Deprecated, do NOT use!

+

This is an alias of self.__call__.

+
+
Parameters
+
    +
  • inputs – Input tensor(s).

  • +
  • *args – additional positional arguments to be passed to self.call.

  • +
  • **kwargs – additional keyword arguments to be passed to self.call.

  • +
+
+
Returns
+

Output tensor(s).

+
+
+
+
+
+build(input_shape)
+

Creates the variables of the layer (optional, for subclass implementers).

+

This is a method that implementers of subclasses of Layer or Model +can override if they need a state-creation step in-between +layer instantiation and layer call.

+

This is typically used to create the weights of Layer subclasses.

+
+
Parameters
+

input_shape – Instance of TensorShape, or list of instances of +TensorShape if the layer expects a list of inputs +(one instance per input).

+
+
+
+
+
+call(inputs, *args, **kwargs)
+

This is where the layer’s logic lives.

+

Note that the call() method in tf.keras differs slightly from the keras API: in the keras API you can pass masking support for layers as additional arguments, whereas tf.keras provides a compute_mask() method to support masking.

+
+
Parameters
+
    +
  • inputs – Input tensor, or list/tuple of input tensors.

  • +
  • *args – Additional positional arguments. Currently unused.

  • +
  • **kwargs – Additional keyword arguments. Currently unused.

  • +
+
+
Returns
+

A tensor or list/tuple of tensors.

+
+
+
+
+
+property compute_dtype
+

The dtype of the layer’s computations.

+

This is equivalent to Layer.dtype_policy.compute_dtype. Unless +mixed precision is used, this is the same as Layer.dtype, the dtype of +the weights.

+

Layers automatically cast their inputs to the compute dtype, which causes +computations and the output to be in the compute dtype as well. This is done +by the base Layer class in Layer.__call__, so you do not have to insert +these casts if implementing your own layer.

+

Layers often perform certain internal computations in higher precision when +compute_dtype is float16 or bfloat16 for numeric stability. The output +will still typically be float16 or bfloat16 in such cases.

+
+
Returns
+

The layer’s compute dtype.

+
+
+
+
+
+compute_mask(inputs, mask=None)
+

Computes an output mask tensor.

+
+
Parameters
+
    +
  • inputs – Tensor or list of tensors.

  • +
  • mask – Tensor or list of tensors.

  • +
+
+
Returns
+

+
None or a tensor (or list of tensors,

one per output tensor of the layer).

+
+
+

+
+
+
+
+
+compute_output_shape(input_shape)
+

Computes the output shape of the layer.

+

If the layer has not been built, this method will call build on the +layer. This assumes that the layer will later be used with inputs that +match the input shape provided here.

+
+
Parameters
+

input_shape – Shape tuple (tuple of integers) +or list of shape tuples (one per output tensor of the layer). +Shape tuples can include None for free dimensions, +instead of an integer.

+
+
Returns
+

An input shape tuple.

+
+
+
+
+
+compute_output_signature(input_signature)
+

Compute the output tensor signature of the layer based on the inputs.

+

Unlike a TensorShape object, a TensorSpec object contains both shape +and dtype information for a tensor. This method allows layers to provide +output dtype information if it is different from the input dtype. +For any layer that doesn’t implement this function, +the framework will fall back to use compute_output_shape, and will +assume that the output dtype matches the input dtype.

+
+
Parameters
+

input_signature – Single TensorSpec or nested structure of TensorSpec +objects, describing a candidate input for the layer.

+
+
Returns
+

+
Single TensorSpec or nested structure of TensorSpec objects, describing

how the layer would transform the provided input.

+
+
+

+
+
Raises
+

TypeError – If input_signature contains a non-TensorSpec object.

+
+
+
+
+
+count_params()
+

Count the total number of scalars composing the weights.

+
+
Returns
+

An integer count.

+
+
Raises
+

ValueError – if the layer isn’t yet built + (in which case its weights aren’t yet defined).

+
+
+
+
+
+property dtype
+

The dtype of the layer weights.

+

This is equivalent to Layer.dtype_policy.variable_dtype. Unless +mixed precision is used, this is the same as Layer.compute_dtype, the +dtype of the layer’s computations.

+
+
+
+property dtype_policy
+

The dtype policy associated with this layer.

+

This is an instance of a tf.keras.mixed_precision.Policy.

+
+
+
+property dynamic
+

Whether the layer is dynamic (eager-only); set in the constructor.

+
+
+
+classmethod from_config(config)
+

Creates a layer from its config.

+

This method is the reverse of get_config, +capable of instantiating the same layer from the config +dictionary. It does not handle layer connectivity +(handled by Network), nor weights (handled by set_weights).

+
+
Parameters
+

config – A Python dictionary, typically the +output of get_config.

+
+
Returns
+

A layer instance.

+
+
+
+
+
+get_input_at(node_index)
+

Retrieves the input tensor(s) of a layer at a given node.

+
+
Parameters
+

node_index – Integer, index of the node +from which to retrieve the attribute. +E.g. node_index=0 will correspond to the +first input node of the layer.

+
+
Returns
+

A tensor (or list of tensors if the layer has multiple inputs).

+
+
Raises
+

RuntimeError – If called in Eager mode.

+
+
+
+
+
+get_input_mask_at(node_index)
+

Retrieves the input mask tensor(s) of a layer at a given node.

+
+
Parameters
+

node_index – Integer, index of the node +from which to retrieve the attribute. +E.g. node_index=0 will correspond to the +first time the layer was called.

+
+
Returns
+

A mask tensor +(or list of tensors if the layer has multiple inputs).

+
+
+
+
+
+get_input_shape_at(node_index)
+

Retrieves the input shape(s) of a layer at a given node.

+
+
Parameters
+

node_index – Integer, index of the node +from which to retrieve the attribute. +E.g. node_index=0 will correspond to the +first time the layer was called.

+
+
Returns
+

A shape tuple +(or list of shape tuples if the layer has multiple inputs).

+
+
Raises
+

RuntimeError – If called in Eager mode.

+
+
+
+
+
+get_losses_for(inputs)
+

Deprecated, do NOT use!

+

Retrieves losses relevant to a specific set of inputs.

+
+
Parameters
+

inputs – Input tensor or list/tuple of input tensors.

+
+
Returns
+

List of loss tensors of the layer that depend on inputs.

+
+
+
+
+
+get_output_at(node_index)
+

Retrieves the output tensor(s) of a layer at a given node.

+
+
Parameters
+

node_index – Integer, index of the node +from which to retrieve the attribute. +E.g. node_index=0 will correspond to the +first output node of the layer.

+
+
Returns
+

A tensor (or list of tensors if the layer has multiple outputs).

+
+
Raises
+

RuntimeError – If called in Eager mode.

+
+
+
+
+
+get_output_mask_at(node_index)
+

Retrieves the output mask tensor(s) of a layer at a given node.

+
+
Parameters
+

node_index – Integer, index of the node +from which to retrieve the attribute. +E.g. node_index=0 will correspond to the +first time the layer was called.

+
+
Returns
+

A mask tensor +(or list of tensors if the layer has multiple outputs).

+
+
+
+
+
+get_output_shape_at(node_index)
+

Retrieves the output shape(s) of a layer at a given node.

+
+
Parameters
+

node_index – Integer, index of the node +from which to retrieve the attribute. +E.g. node_index=0 will correspond to the +first time the layer was called.

+
+
Returns
+

A shape tuple +(or list of shape tuples if the layer has multiple outputs).

+
+
Raises
+

RuntimeError – If called in Eager mode.

+
+
+
+
+
+get_updates_for(inputs)
+

Deprecated, do NOT use!

+

Retrieves updates relevant to a specific set of inputs.

+
+
Parameters
+

inputs – Input tensor or list/tuple of input tensors.

+
+
Returns
+

List of update ops of the layer that depend on inputs.

+
+
+
+
+
+get_weights()
+

Returns the current weights of the layer, as NumPy arrays.

+

The weights of a layer represent the state of the layer. This function +returns both trainable and non-trainable weight values associated with this +layer as a list of NumPy arrays, which can in turn be used to load state +into similarly parameterized layers.

+

For example, a Dense layer returns a list of two values: the kernel matrix +and the bias vector. These can be used to set the weights of another +Dense layer:

+
>>> layer_a = tf.keras.layers.Dense(1,
+...   kernel_initializer=tf.constant_initializer(1.))
+>>> a_out = layer_a(tf.convert_to_tensor([[1., 2., 3.]]))
+>>> layer_a.get_weights()
+[array([[1.],
+       [1.],
+       [1.]], dtype=float32), array([0.], dtype=float32)]
+>>> layer_b = tf.keras.layers.Dense(1,
+...   kernel_initializer=tf.constant_initializer(2.))
+>>> b_out = layer_b(tf.convert_to_tensor([[10., 20., 30.]]))
+>>> layer_b.get_weights()
+[array([[2.],
+       [2.],
+       [2.]], dtype=float32), array([0.], dtype=float32)]
+>>> layer_b.set_weights(layer_a.get_weights())
+>>> layer_b.get_weights()
+[array([[1.],
+       [1.],
+       [1.]], dtype=float32), array([0.], dtype=float32)]
+
+
+
+
Returns
+

Weights values as a list of NumPy arrays.

+
+
+
+
+
+property inbound_nodes
+

Deprecated, do NOT use! Only for compatibility with external Keras.

+
+
+
+property input
+

Retrieves the input tensor(s) of a layer.

+

Only applicable if the layer has exactly one input, +i.e. if it is connected to one incoming layer.

+
+
Returns
+

Input tensor or list of input tensors.

+
+
Raises
+
    +
  • RuntimeError – If called in Eager mode.

  • +
  • AttributeError – If no inbound nodes are found.

  • +
+
+
+
+
+
+property input_mask
+

Retrieves the input mask tensor(s) of a layer.

+

Only applicable if the layer has exactly one inbound node, +i.e. if it is connected to one incoming layer.

+
+
Returns
+

Input mask tensor (potentially None) or list of input +mask tensors.

+
+
Raises
+
    +
  • AttributeError – if the layer is connected to more than one incoming layer.

  • +
+
+
+
+
+
+property input_shape
+

Retrieves the input shape(s) of a layer.

+

Only applicable if the layer has exactly one input, +i.e. if it is connected to one incoming layer, or if all inputs +have the same shape.

+
+
Returns
+

Input shape, as an integer shape tuple +(or list of shape tuples, one tuple per input tensor).

+
+
Raises
+
    +
  • AttributeError – if the layer has no defined input_shape.

  • +
  • RuntimeError – if called in Eager mode.

  • +
+
+
+
+
+
+property input_spec
+

InputSpec instance(s) describing the input format for this layer.

+

When you create a layer subclass, you can set self.input_spec to enable +the layer to run input compatibility checks when it is called. +Consider a Conv2D layer: it can only be called on a single input tensor +of rank 4. As such, you can set, in __init__():

+

`python +self.input_spec = tf.keras.layers.InputSpec(ndim=4) +`

+

Now, if you try to call the layer on an input that isn’t rank 4 (for instance, an input of shape (2,)), it will raise a nicely-formatted error:

+

` +ValueError: Input 0 of layer conv2d is incompatible with the layer: +expected ndim=4, found ndim=1. Full shape received: [2] +`

+

Input checks that can be specified via input_spec include: +- Structure (e.g. a single input, a list of 2 inputs, etc) +- Shape +- Rank (ndim) +- Dtype

+

For more information, see tf.keras.layers.InputSpec.

+
+
Returns
+

A tf.keras.layers.InputSpec instance, or nested structure thereof.

+
+
+
+
+
+property losses
+

List of losses added using the add_loss() API.

+

Variable regularization tensors are created when this property is accessed, +so it is eager safe: accessing losses under a tf.GradientTape will +propagate gradients back to the corresponding variables.

+

Examples:

+
>>> class MyLayer(tf.keras.layers.Layer):
+...   def call(self, inputs):
+...     self.add_loss(tf.abs(tf.reduce_mean(inputs)))
+...     return inputs
+>>> l = MyLayer()
+>>> l(np.ones((10, 1)))
+>>> l.losses
+[1.0]
+
+
+
>>> inputs = tf.keras.Input(shape=(10,))
+>>> x = tf.keras.layers.Dense(10)(inputs)
+>>> outputs = tf.keras.layers.Dense(1)(x)
+>>> model = tf.keras.Model(inputs, outputs)
+>>> # Activity regularization.
+>>> len(model.losses)
+0
+>>> model.add_loss(tf.abs(tf.reduce_mean(x)))
+>>> len(model.losses)
+1
+
+
+
>>> inputs = tf.keras.Input(shape=(10,))
+>>> d = tf.keras.layers.Dense(10, kernel_initializer='ones')
+>>> x = d(inputs)
+>>> outputs = tf.keras.layers.Dense(1)(x)
+>>> model = tf.keras.Model(inputs, outputs)
+>>> # Weight regularization.
+>>> model.add_loss(lambda: tf.reduce_mean(d.kernel))
+>>> model.losses
+[<tf.Tensor: shape=(), dtype=float32, numpy=1.0>]
+
+
+
+
Returns
+

A list of tensors.

+
+
+
+
+
+property metrics
+

List of metrics added using the add_metric() API.

+

Example:

+
>>> input = tf.keras.layers.Input(shape=(3,))
+>>> d = tf.keras.layers.Dense(2)
+>>> output = d(input)
+>>> d.add_metric(tf.reduce_max(output), name='max')
+>>> d.add_metric(tf.reduce_min(output), name='min')
+>>> [m.name for m in d.metrics]
+['max', 'min']
+
+
+
+
Returns
+

A list of Metric objects.

+
+
+
+
+
+property name
+

Name of the layer (string), set in the constructor.

+
+
+
+property name_scope
+

Returns a tf.name_scope instance for this class.

+
+
+
+property non_trainable_variables
+

Sequence of non-trainable variables owned by this module and its submodules.

+

Note: this method uses reflection to find variables on the current instance +and submodules. For performance reasons you may wish to cache the result +of calling this method if you don’t expect the return value to change.

+
+
Returns
+

A sequence of variables for the current module (sorted by attribute +name) followed by variables from all submodules recursively (breadth +first).

+
+
+
+
+
+property non_trainable_weights
+

List of all non-trainable weights tracked by this layer.

+

Non-trainable weights are not updated during training. They are expected +to be updated manually in call().

+
+
Returns
+

A list of non-trainable variables.

+
+
+
+
+
+property outbound_nodes
+

Deprecated, do NOT use! Only for compatibility with external Keras.

+
+
+
+property output
+

Retrieves the output tensor(s) of a layer.

+

Only applicable if the layer has exactly one output, +i.e. if it is connected to one incoming layer.

+
+
Returns
+

Output tensor or list of output tensors.

+
+
Raises
+
    +
  • AttributeError – if the layer is connected to more than one incoming layer.

  • +
  • RuntimeError – if called in Eager mode.

  • +
+
+
+
+
+
+property output_mask
+

Retrieves the output mask tensor(s) of a layer.

+

Only applicable if the layer has exactly one inbound node, +i.e. if it is connected to one incoming layer.

+
+
Returns
+

Output mask tensor (potentially None) or list of output +mask tensors.

+
+
Raises
+
    +
  • AttributeError – if the layer is connected to more than one incoming layer.

  • +
+
+
+
+
+
+property output_shape
+

Retrieves the output shape(s) of a layer.

+

Only applicable if the layer has one output, +or if all outputs have the same shape.

+
+
Returns
+

Output shape, as an integer shape tuple +(or list of shape tuples, one tuple per output tensor).

+
+
Raises
+
    +
  • AttributeError – if the layer has no defined output shape.

  • +
  • RuntimeError – if called in Eager mode.

  • +
+
+
+
+
+
+reset_state()
+

Resets all of the metric state variables.

+

This function is called between epochs/steps, +when a metric is evaluated during training.

+
+
+
+reset_states()
+
+
+
+result()
+

Computes and returns the metric value tensor.

+

Result computation is an idempotent operation that simply calculates the +metric value using the state variables.

+
+
+
+set_weights(weights)
+

Sets the weights of the layer, from NumPy arrays.

+

The weights of a layer represent the state of the layer. This function +sets the weight values from numpy arrays. The weight values should be +passed in the order they are created by the layer. Note that the layer’s +weights must be instantiated before calling this function, by calling +the layer.

+

For example, a Dense layer returns a list of two values: the kernel matrix +and the bias vector. These can be used to set the weights of another +Dense layer:

+
>>> layer_a = tf.keras.layers.Dense(1,
+...   kernel_initializer=tf.constant_initializer(1.))
+>>> a_out = layer_a(tf.convert_to_tensor([[1., 2., 3.]]))
+>>> layer_a.get_weights()
+[array([[1.],
+       [1.],
+       [1.]], dtype=float32), array([0.], dtype=float32)]
+>>> layer_b = tf.keras.layers.Dense(1,
+...   kernel_initializer=tf.constant_initializer(2.))
+>>> b_out = layer_b(tf.convert_to_tensor([[10., 20., 30.]]))
+>>> layer_b.get_weights()
+[array([[2.],
+       [2.],
+       [2.]], dtype=float32), array([0.], dtype=float32)]
+>>> layer_b.set_weights(layer_a.get_weights())
+>>> layer_b.get_weights()
+[array([[1.],
+       [1.],
+       [1.]], dtype=float32), array([0.], dtype=float32)]
+
+
+
+
Parameters
+

weights – a list of NumPy arrays. The number +of arrays and their shape must match +number of the dimensions of the weights +of the layer (i.e. it should match the +output of get_weights).

+
+
Raises
+

ValueError – If the provided weights list does not match the + layer’s specifications.

+
+
+
+
+
+property stateful
+
+
+
+property submodules
+

Sequence of all sub-modules.

+

Submodules are modules which are properties of this module, or found as +properties of modules which are properties of this module (and so on).

+
>>> a = tf.Module()
+>>> b = tf.Module()
+>>> c = tf.Module()
+>>> a.b = b
+>>> b.c = c
+>>> list(a.submodules) == [b, c]
+True
+>>> list(b.submodules) == [c]
+True
+>>> list(c.submodules) == []
+True
+
+
+
+
Returns
+

A sequence of all submodules.

+
+
+
+
+
+property supports_masking
+

Whether this layer supports computing a mask using compute_mask.

+
+
+
+property trainable
+
+
+
+property trainable_variables
+

Sequence of trainable variables owned by this module and its submodules.

+

Note: this method uses reflection to find variables on the current instance +and submodules. For performance reasons you may wish to cache the result +of calling this method if you don’t expect the return value to change.

+
+
Returns
+

A sequence of variables for the current module (sorted by attribute +name) followed by variables from all submodules recursively (breadth +first).

+
+
+
+
+
+property trainable_weights
+

List of all trainable weights tracked by this layer.

+

Trainable weights are updated via gradient descent during training.

+
+
Returns
+

A list of trainable variables.

+
+
+
+
+
+update_state(y_true, y_pred, sample_weight=None)
+

Accumulates statistics for the metric.

+

Note: This function is executed as a graph function in graph mode. +This means:

+
+
    +
  1. Operations on the same resource are executed in textual order. This should make it easier to do things like add the updated value of a variable to another, for example.

  2. You don’t need to worry about collecting the update ops to execute. All update ops added to the graph by this function will be executed.
+

As a result, code should generally work the same way with graph or +eager execution.

+
+
+
Parameters
+
    +
  • *args

  • +
  • **kwargs – A mini-batch of inputs to the Metric.

  • +
+
+
+
+
+
+property updates
+
+
+
+property variable_dtype
+

Alias of Layer.dtype, the dtype of the weights.

+
+
+
+property variables
+

Returns the list of all layer variables/weights.

+

Alias of self.weights.

+

Note: This will not track the weights of nested tf.Modules that are not +themselves Keras layers.

+
+
Returns
+

A list of variables.

+
+
+
+
+
+property weights
+

Returns the list of all layer variables/weights.

+
+
Returns
+

A list of variables.

+
+
+
+
+
+classmethod with_name_scope(method)
+

Decorator to automatically enter the module name scope.

+
>>> class MyModule(tf.Module):
+...   @tf.Module.with_name_scope
+...   def __call__(self, x):
+...     if not hasattr(self, 'w'):
+...       self.w = tf.Variable(tf.random.normal([x.shape[1], 3]))
+...     return tf.matmul(x, self.w)
+
+
+

Using the above module would produce `tf.Variable`s and `tf.Tensor`s whose +names included the module name:

+
>>> mod = MyModule()
+>>> mod(tf.ones([1, 2]))
+<tf.Tensor: shape=(1, 3), dtype=float32, numpy=..., dtype=float32)>
+>>> mod.w
+<tf.Variable 'my_module/Variable:0' shape=(2, 3) dtype=float32,
+numpy=..., dtype=float32)>
+
+
+
+
Parameters
+

method – The method to wrap.

+
+
Returns
+

The original method wrapped such that it enters the module’s name scope.

+
+
+
+
+
+
+dataprofiler.labelers.character_level_cnn_model.build_embd_dictionary(filename)
+

Returns a numpy embedding dictionary from an embedding file in GloVe-like format

+
+
Parameters
+

filename (str) – Path to the embed file for loading

+
+
+
+
+
+dataprofiler.labelers.character_level_cnn_model.create_glove_char(n_dims, source_file=None)
+

Factors GloVe character embeddings from the source file down to n_dims principal components and saves them in a new file

+
+
Parameters
+
    +
  • n_dims (int) – Final number of principal component dims of the embeddings

  • +
  • source_file (str) – Location of original embeddings to factor down

  • +
+
+
+
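A hedged sketch tying the two helpers above together; the file paths are illustrative (the package ships a reduced file named along the lines of glove-reduced-64D.txt):

from dataprofiler.labelers.character_level_cnn_model import (
    build_embd_dictionary,
    create_glove_char,
)

# Factor the original character embeddings down to 64 principal components
# and write them to a new GloVe-like file (paths are illustrative).
create_glove_char(n_dims=64, source_file='embeddings/glove-char.840B.300d.txt')

# Load the reduced GloVe-like file into a {character: numpy vector} dictionary.
embd_dict = build_embd_dictionary('embeddings/glove-reduced-64D.txt')
print(embd_dict['a'].shape)  # expected: (64,)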
+
+
+class dataprofiler.labelers.character_level_cnn_model.CharacterLevelCnnModel(label_mapping=None, parameters=None)
+

Bases: dataprofiler.labelers.base_model.BaseTrainableModel

+

CNN model initializer; initializes epoch_id.

+
+
Parameters
+
    +
  • label_mapping (dict) – maps labels to their encoded integers

  • +
  • parameters (dict) –

    Contains all the appropriate parameters for the +model. Must contain num_labels. Other possible parameters are:

    +
    +

    max_length, max_char_encoding_id, dim_embed, size_fc, dropout, size_conv, num_fil, optimizer, default_label

    +
    +

  • +
+
+
Returns
+

None

+
+
+
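A minimal construction sketch, assuming an illustrative label mapping (index 0 reserved, in line with requires_zero_mapping below) and only the required num_labels parameter plus one optional key:

from dataprofiler.labelers.character_level_cnn_model import CharacterLevelCnnModel

# Illustrative label mapping; index 0 is reserved (requires_zero_mapping is True).
label_mapping = {'PAD': 0, 'UNKNOWN': 1, 'ADDRESS': 2, 'PHONE': 3}

model = CharacterLevelCnnModel(
    label_mapping=label_mapping,
    parameters={'num_labels': len(label_mapping),  # required
                'max_length': 3400},               # optional; value illustrative
)
model.help()  # prints the alterable parameters described above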
+
+requires_zero_mapping = True
+
+
+
+set_label_mapping(label_mapping)
+

Sets the labels for the model

+
+
Parameters
+

label_mapping (dict) – label mapping of the model

+
+
Returns
+

None

+
+
+
+
+
+save_to_disk(dirpath)
+

Saves whole model to disk with weights

+
+
Parameters
+

dirpath (str) – directory path where you want to save the model to

+
+
Returns
+

None

+
+
+
+
+
+classmethod load_from_disk(dirpath)
+

Loads whole model from disk with weights

+
+
Parameters
+

dirpath (str) – directory path where you want to load the model from

+
+
Returns
+

None

+
+
+
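A hedged save/load round trip, continuing from the construction sketch above (the directory name is illustrative):

# Persist the model and its weights, then restore it later.
model.save_to_disk('saved_cnn_model')            # `model` from the earlier construction sketch

restored = CharacterLevelCnnModel.load_from_disk('saved_cnn_model')
restored.details()  # summary, parameters and label mapping of the restored model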
+
+
+reset_weights()
+

Reset the weights of the model.

+
+
Returns
+

None

+
+
+
+
+
+fit(train_data, val_data=None, batch_size=32, label_mapping=None, reset_weights=False, verbose=True)
+

Train the current model with the training data and validation data

+
+
Parameters
+
    +
  • train_data (Union[list, np.ndarray]) – Training data used to train model

  • +
  • val_data (Union[list, np.ndarray]) – Validation data used to validate the training

  • +
  • batch_size (int) – Used to determine number of samples in each batch

  • +
  • label_mapping (Union[dict, None]) – maps labels to their encoded integers

  • +
  • reset_weights (bool) – Flag to determine whether to reset the weights or +not

  • +
  • verbose (bool) – Flag to determine whether to print status or not

  • +
+
+
Returns
+

None

+
+
+
+
+
+add_label(label, same_as=None)
+

Adds a label to the data labeler.

+
+
Parameters
+
    +
  • label (str) – new label being added to the data labeler

  • +
  • same_as (str) – label to have the same encoding index as for multi-label +to single encoding index.

  • +
+
+
Returns
+

None

+
+
+
+
+
+classmethod get_class(class_name)
+
+
+
+get_parameters(param_list=None)
+

Returns a dict of parameters from the model given a list.

:param param_list: list of parameters to retrieve from the model.
:type param_list: list
:return: dict of parameters

+
+
+
+classmethod help()
+

Help function describing alterable parameters.

+
+
Returns
+

None

+
+
+
+
+
+property label_mapping
+

mapping of labels to their encoded values

+
+

+
+
+
+
+
+property labels
+

Retrieves the labels.

:return: list of labels

+
+
+
+property num_labels
+
+
+
+predict(data, batch_size=32, show_confidences=False, verbose=True)
+

Run model and get predictions

+
+
Parameters
+
    +
  • data (Union[list, numpy.ndarray]) – text input

  • +
  • batch_size (int) – number of samples in the batch of data

  • +
  • show_confidences – whether user wants prediction confidences

  • +
  • verbose (bool) – Flag to determine whether to print status or not

  • +
+
+
Returns
+

char level predictions and confidences

+
+
Return type
+

dict

+
+
+
+
+
+property reverse_label_mapping
+

Reversed order of current labels, useful when Labels need to be extracted via indices

+
+

+
+
+
+
+
+set_params(**kwargs)
+

Given kwargs, set the parameters if they exist.

+
+
+
+details()
+

Prints the relevant details of the model (summary, parameters, label +mapping)

+
+
+
\ No newline at end of file
diff --git a/docs/0.7.1/html/dataprofiler.labelers.classification_report_utils.html b/docs/0.7.1/html/dataprofiler.labelers.classification_report_utils.html
new file mode 100644
index 000000000..5d86ac8f7
--- /dev/null
+++ b/docs/0.7.1/html/dataprofiler.labelers.classification_report_utils.html
@@ -0,0 +1,436 @@
Classification Report Utils - Data Profiler v0.7.1

Classification Report Utils

+
+
+dataprofiler.labelers.classification_report_utils.convert_confusion_matrix_to_MCM(conf_matrix)
+

Converts a confusion matrix into the MCM format for precision/recall/fscore/ +support computation by sklearn. The format is as specified by sklearn below: +In multilabel confusion matrix \(MCM\), the count of true negatives +is \(MCM_{:,0,0}\), false negatives is \(MCM_{:,1,0}\), +true positives is \(MCM_{:,1,1}\) and false positives is +\(MCM_{:,0,1}\). +Note: this utilizes code/ideology from sklearn.

+
+
Parameters
+

conf_matrix (Union[list, np.ndarray]) – confusion matrix, which is a square matrix describing +false positives and false negatives, true positives and true negatives +for classification

+
+
Returns
+

MCM format for readability by sklearn confusion reports.

+
+
Return type
+

np.ndarray

+
+
+
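A small illustration of the conversion described above; the confusion matrix counts are made up:

import numpy as np
from dataprofiler.labelers.classification_report_utils import (
    convert_confusion_matrix_to_MCM,
)

# Rows are true labels, columns are predicted labels (illustrative counts).
conf_matrix = np.array([[50,  5],
                        [ 3, 42]])

MCM = convert_confusion_matrix_to_MCM(conf_matrix)
print(MCM.shape)  # (2, 2, 2): one [[TN, FP], [FN, TP]] block per label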
+
+
+dataprofiler.labelers.classification_report_utils.precision_recall_fscore_support(MCM, beta=1.0, labels=None, pos_label=1, average=None, warn_for=('precision', 'recall', 'f-score'), sample_weight=None)
+

Copy of the precision_recall_fscore_support function from sklearn.metrics +with the update to receiving the MCM instead of calculating each time it is +called.

+
+
Parameters
+
    +
  • MCM (array, shape (n_outputs, 2, 2)) – Multi-classification confusion matrix as referenced by the sklearn +metrics module. A 2x2 confusion matrix corresponding to each output in +the input. In multilabel confusion matrix \(MCM\), the count of +true negatives is \(MCM_{:,0,0}\), false negatives is +\(MCM_{:,1,0}\), true positives is \(MCM_{:,1,1}\) and false +positives is \(MCM_{:,0,1}\).

  • +
  • beta (float, 1.0 by default) – The strength of recall versus precision in the F-score.

  • +
  • labels (list, optional) – The set of labels to include when average != 'binary', and their +order if average is None. Labels present in the data can be +excluded, for example to calculate a multiclass average ignoring a +majority negative class, while labels not present in the data will +result in 0 components in a macro average. For multilabel targets, +labels are column indices. By default, all labels in y_true and +y_pred are used in sorted order.

  • +
  • pos_label (str or int, 1 by default) – The class to report if average='binary' and the data is binary. +If the data are multiclass or multilabel, this will be ignored; +setting labels=[pos_label] and average != 'binary' will report +scores for that label only.

  • +
  • average (string, [None (default), 'binary', 'micro', 'macro', 'weighted']) –

    If None, the scores for each class are returned. Otherwise, this +determines the type of averaging performed on the data:

    +
    +
    'binary':

    Only report results for the class specified by pos_label. +This is applicable only if targets (y_{true,pred}) are binary.

    +
    +
    'micro':

    Calculate metrics globally by counting the total true positives, +false negatives and false positives.

    +
    +
    'macro':

    Calculate metrics for each label, and find their unweighted +mean. This does not take label imbalance into account.

    +
    +
    'weighted':

    Calculate metrics for each label, and find their average weighted +by support (the number of true instances for each label). This +alters ‘macro’ to account for label imbalance; it can result in an +F-score that is not between precision and recall.

    +
    +
    +

  • +
  • warn_for (tuple or set, for internal use) – This determines which warnings will be made in the case that this +function is being used to return only one of its metrics.

  • +
  • sample_weight (array-like of shape = [n_samples], optional) – Sample weights.

  • +
+
+
Returns
+

    +
  • precision (float (if average is not None) or array of float, shape = [n_unique_labels])

  • +
  • recall (float (if average is not None) or array of float, , shape = [n_unique_labels])

  • +
  • fbeta_score (float (if average is not None) or array of float, shape = [n_unique_labels])

  • +
  • support (int (if average is not None) or array of int, shape = [n_unique_labels]) – The number of occurrences of each label in y_true.

  • +
+

+
+
+

References

+
+
1
+

Wikipedia entry for the Precision and recall

+
+
2
+

Wikipedia entry for the F1-score

+
+
3
+

Discriminative Methods for Multi-labeled Classification Advances +in Knowledge Discovery and Data Mining (2004), pp. 22-30 by Shantanu +Godbole, Sunita Sarawagi

+
+
+

Notes

+

When true positive + false positive == 0, precision is undefined; +When true positive + false negative == 0, recall is undefined. +In such cases, the metric will be set to 0, as will f-score, and +UndefinedMetricWarning will be raised.

+
+
+
+dataprofiler.labelers.classification_report_utils.classification_report(conf_matrix, labels=None, target_names=None, sample_weight=None, digits=2, output_dict=False)
+

Copy of the classification_report function from sklearn.metrics +with the update to receiving the conf_matrix instead of calculating each +time it is called.

+

Build a text report showing the main classification metrics

+

Read more in the User Guide.

+
+
Parameters
+
    +
  • conf_matrix (array, shape = [n_labels, n_labels]) – confusion matrix, which is a square matrix describing +false positives and false negatives, true positives and true negatives +for classification.

  • +
  • labels (array, shape = [n_labels]) – Optional list of label indices to include in the report.

  • +
  • target_names (list of strings) – Optional display names matching the labels (same order).

  • +
  • sample_weight (array-like of shape = [n_samples], optional) – Sample weights.

  • +
  • digits (int) – Number of digits for formatting output floating point values. +When output_dict is True, this will be ignored and the +returned values will not be rounded.

  • +
  • output_dict (bool (default = False)) – If True, return output as dict

  • +
+
+
Returns
+

report – Text summary of the precision, recall, F1 score for each class. +Dictionary returned if output_dict is True. Dictionary has the +following structure:

+
{'label 1': {'precision':0.5,
+             'recall':1.0,
+             'f1-score':0.67,
+             'support':1},
+ 'label 2': { ... },
+  ...
+}
+
+
+

The reported averages include macro average (averaging the unweighted mean per label), weighted average (averaging the support-weighted mean per label), sample average (only for multilabel classification) and micro average (averaging the total true positives, false negatives and false positives); micro average is only shown for multi-label or multi-class with a subset of classes, because it corresponds to accuracy otherwise. See also precision_recall_fscore_support for more details on averages.

+

Note that in binary classification, recall of the positive class +is also known as “sensitivity”; recall of the negative class is +“specificity”.

+

+
+
Return type
+

string / dict

+
+
+
+

See also

+

precision_recall_fscore_support, confusion_matrix, multilabel_confusion_matrix

+
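A hedged usage sketch reusing the confusion matrix from the MCM sketch above; the display names are illustrative:

from dataprofiler.labelers.classification_report_utils import classification_report

report = classification_report(
    conf_matrix,                              # square confusion matrix, as above
    labels=[0, 1],
    target_names=['BACKGROUND', 'ADDRESS'],   # illustrative display names
    output_dict=True,
)
print(report['ADDRESS'])  # per-class precision, recall, f1-score and support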
+
+
\ No newline at end of file
diff --git a/docs/0.7.1/html/dataprofiler.labelers.data_labelers.html b/docs/0.7.1/html/dataprofiler.labelers.data_labelers.html
new file mode 100644
index 000000000..b793c9d6c
--- /dev/null
+++ b/docs/0.7.1/html/dataprofiler.labelers.data_labelers.html
@@ -0,0 +1,867 @@
Data Labelers - Data Profiler v0.7.1

Data Labelers

+
+
+dataprofiler.labelers.data_labelers.train_structured_labeler(data, default_label=None, save_dirpath=None, epochs=2)
+

Uses provided data to create and save a structured data labeler

+
+
Parameters
+
    +
  • data (Union[None, pd.DataFrame]) – data to be trained upon

  • +
  • save_dirpath (Union[None, str]) – path to save data labeler

  • +
  • epochs (int) – number of epochs to loop training the data

  • +
+
+
Returns
+

+
+
+
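A minimal sketch of the documented signature; the DataFrame contents and save path are illustrative:

import pandas as pd
from dataprofiler.labelers.data_labelers import train_structured_labeler

df = pd.DataFrame({
    'address': ['123 Main St', '9 Elm Ave'],
    'phone':   ['555-123-4567', '555-987-6543'],
})

train_structured_labeler(data=df, save_dirpath='my_structured_labeler', epochs=2)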
+
+
+class dataprofiler.labelers.data_labelers.UnstructuredDataLabeler(dirpath=None, load_options=None)
+

Bases: dataprofiler.labelers.base_data_labeler.BaseDataLabeler

+

Initialize DataLabeler class.

+
+
Parameters
+
    +
  • dirpath – path to data labeler

  • +
  • load_options – optional arguments to include for load i.e. class +for model or processors

  • +
+
+
+
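A hedged load-and-predict sketch; with dirpath left as None the labeler is expected to load its default pretrained components, and the sample text is illustrative:

from dataprofiler.labelers.data_labelers import UnstructuredDataLabeler

labeler = UnstructuredDataLabeler()   # default pretrained unstructured labeler
labeler.check_pipeline()              # verify the processors and model connect
output = labeler.predict(['John Doe lives at 123 Main St, 555-123-4567.'])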
+
+add_label(label, same_as=None)
+

Adds a label to the data labeler.

+
+
Parameters
+
    +
  • label (str) – new label being added to the data labeler

  • +
  • same_as (str) – label to have the same encoding index as for multi-label +to single encoding index.

  • +
+
+
Returns
+

None

+
+
+
+
+
+check_pipeline(skip_postprocessor=False, error_on_mismatch=False)
+

Checks whether the processors and models connect together without error.

+
+
Parameters
+
    +
  • skip_postprocessor (bool) – skip checking postprocessor is valid in +pipeline

  • +
  • error_on_mismatch (bool) – if true, errors instead of warns on parameter +mismatches in pipeline

  • +
+
+
Returns
+

bool indicating valid pipeline

+
+
+
+
+
+help()
+

Help function describing alterable parameters, input data formats +for preprocessors, and output data formats for postprocessors.

+
+
Returns
+

None

+
+
+
+
+
+property label_mapping
+

Retrieves the label encodings

+
+
Returns
+

dictionary for associating labels to indexes

+
+
+
+
+
+property labels
+

Retrieves the labels

+
+
Returns
+

list of labels

+
+
+
+
+
+classmethod load_from_disk(dirpath, load_options=None)
+

Loads the data labeler from a saved location on disk.

+
+
Parameters
+
    +
  • dirpath (str) – path to data labeler files.

  • +
  • load_options (dict) – optional arguments to include for load i.e. class +for model or processors

  • +
+
+
Returns
+

DataLabeler class

+
+
+
+
+
+classmethod load_from_library(name)
+

Loads the data labeler from the data labeler zoo in the library.

+
+
Parameters
+

name (str) – name of the data labeler.

+
+
Returns
+

DataLabeler class

+
+
+
+
+
+classmethod load_with_components(preprocessor, model, postprocessor)
+

Loads the data labeler from its set of components.

+
+
Parameters
+
+
+
Returns
+

+
+
+
+
+
+property model
+

Retrieves the data labeler model

+
+
Returns
+

returns the model instance

+
+
+
+
+
+property postprocessor
+

Retrieves the data postprocessor

+
+
Returns
+

returns the postprocessor instance

+
+
+
+
+
+predict(data, batch_size=32, predict_options=None, error_on_mismatch=False, verbose=1)
+

Predicts labels of input data with the data labeler model.

+
+
Parameters
+
    +
  • data – data to be predicted upon

  • +
  • batch_size – batch size of prediction

  • +
  • predict_options – optional parameters to allow for predict as a +dict, i.e. dict(show_confidences=True)

  • +
  • error_on_mismatch – if true, errors instead of warns on parameter +mismatches in pipeline

  • +
  • verbose – Flag to determine whether to print status or not

  • +
+
+
Returns
+

predictions

+
+
+
+
+
+property preprocessor
+

Retrieves the data preprocessor

+
+
Returns
+

returns the preprocessor instance

+
+
+
+
+
+property reverse_label_mapping
+

Retrieves the index to label encoding

+
+
Returns
+

dictionary for associating indexes to labels

+
+
+
+
+
+save_to_disk(dirpath)
+

Saves the data labeler to the specified location

+
+
Parameters
+

dirpath (str) – location to save the data labeler.

+
+
Returns
+

None

+
+
+
+
+
+set_labels(labels)
+

Sets the labels for the data labeler.

+
+
Parameters
+

labels (list or dict) – new labels in either encoding list or dict

+
+
Returns
+

None

+
+
+
+
+
+set_model(model)
+

Set the model for the data labeler

+
+
Parameters
+

model (base_model.BaseModel) – model to use within the data labeler

+
+
Returns
+

None

+
+
+
+
+
+set_params(params)
+

Allows user to set parameters of pipeline components in the following +format:

+
+
+
params = dict(
    preprocessor=dict(…),
    model=dict(…),
    postprocessor=dict(…)
)

+
+

where the key, value pairs for each pipeline component must match parameters that exist in their components.

+
+
Parameters
+

params (dict) –

dictionary containing a key for a given pipeline +component and its associated value of parameters as such:

+
+

dict(preprocessor=dict(…), model=dict(…), +postprocessor=dict(…))

+
+

+
+
Returns
+

None

+
+
+
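A hedged sketch of the expected call shape, continuing from the labeler loaded earlier; the parameter names inside each component dict are illustrative and must match parameters that actually exist on the configured preprocessor, model and postprocessor:

labeler.set_params(dict(
    preprocessor=dict(default_label='UNKNOWN'),      # illustrative parameter name
    model=dict(max_length=3400),                     # illustrative parameter name
    postprocessor=dict(use_word_level_argmax=True),  # illustrative parameter name
))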
+
+
+set_postprocessor(data_processor)
+

Set the data postprocessor for the data labeler

+
+
Parameters
+

data_processor (data_processing.BaseDataPostprocessor) – processor to set as the postprocessor

+
+
Returns
+

None

+
+
+
+
+
+set_preprocessor(data_processor)
+

Set the data preprocessor for the data labeler

+
+
Parameters
+

data_processor (data_processing.BaseDataPreprocessor) – processor to set as the preprocessor

+
+
Returns
+

None

+
+
+
+
+
+
+class dataprofiler.labelers.data_labelers.StructuredDataLabeler(dirpath=None, load_options=None)
+

Bases: dataprofiler.labelers.base_data_labeler.BaseDataLabeler

+

Initialize DataLabeler class.

+
+
Parameters
+
    +
  • dirpath – path to data labeler

  • +
  • load_options – optional arguments to include for load i.e. class +for model or processors

  • +
+
+
+
+
+add_label(label, same_as=None)
+

Adds a label to the data labeler.

+
+
Parameters
+
    +
  • label (str) – new label being added to the data labeler

  • +
  • same_as (str) – label to have the same encoding index as for multi-label +to single encoding index.

  • +
+
+
Returns
+

None

+
+
+
+
+
+check_pipeline(skip_postprocessor=False, error_on_mismatch=False)
+

Checks whether the processors and models connect together without error.

+
+
Parameters
+
    +
  • skip_postprocessor (bool) – skip checking postprocessor is valid in +pipeline

  • +
  • error_on_mismatch (bool) – if true, errors instead of warns on parameter +mismatches in pipeline

  • +
+
+
Returns
+

bool indicating valid pipeline

+
+
+
+
+
+help()
+

Help function describing alterable parameters, input data formats +for preprocessors, and output data formats for postprocessors.

+
+
Returns
+

None

+
+
+
+
+
+property label_mapping
+

Retrieves the label encodings

+
+
Returns
+

dictionary for associating labels to indexes

+
+
+
+
+
+property labels
+

Retrieves the labels

+
+
Returns
+

list of labels

+
+
+
+
+
+classmethod load_from_disk(dirpath, load_options=None)
+

Loads the data labeler from a saved location on disk.

+
+
Parameters
+
    +
  • dirpath (str) – path to data labeler files.

  • +
  • load_options (dict) – optional arguments to include for load i.e. class +for model or processors

  • +
+
+
Returns
+

DataLabeler class

+
+
+
+
+
+classmethod load_from_library(name)
+

Loads the data labeler from the data labeler zoo in the library.

+
+
Parameters
+

name (str) – name of the data labeler.

+
+
Returns
+

DataLabeler class

+
+
+
+
+
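A short sketch of the persistence round trip (the directory name is illustrative; 'structured_model' is one of the library names listed in the Labelers overview later in this diff):

    from dataprofiler.labelers.data_labelers import StructuredDataLabeler

    labeler = StructuredDataLabeler.load_from_library('structured_model')
    labeler.save_to_disk('my_structured_labeler')        # see save_to_disk() below
    restored = StructuredDataLabeler.load_from_disk('my_structured_labeler')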
+classmethod load_with_components(preprocessor, model, postprocessor)
+

Loads the data labeler from its set of components.

+
+
Parameters
+
+
+
Returns
+

+
+
+
+
+
+property model
+

Retrieves the data labeler model

+
+
Returns
+

returns the model instance

+
+
+
+
+
+property postprocessor
+

Retrieves the data postprocessor

+
+
Returns
+

returns the postprocessor instance

+
+
+
+
+
+predict(data, batch_size=32, predict_options=None, error_on_mismatch=False, verbose=1)
+

Predicts labels of input data based with the data labeler model.

+
+
Parameters
+
    +
  • data – data to be predicted upon

  • +
  • batch_size – batch size of prediction

  • +
  • predict_options – optional parameters to allow for predict as a +dict, i.e. dict(show_confidences=True)

  • +
  • error_on_mismatch – if true, errors instead of warns on parameter +mismatches in pipeline

  • +
  • verbose – Flag to determine whether to print status or not

  • +
+
+
Returns
+

predictions

+
+
+
+
+
+property preprocessor
+

Retrieves the data preprocessor

+
+
Returns
+

returns the preprocessor instance

+
+
+
+
+
+property reverse_label_mapping
+

Retrieves the index to label encoding

+
+
Returns
+

dictionary for associating indexes to labels

+
+
+
+
+
+save_to_disk(dirpath)
+

Saves the data labeler to the specified location

+
+
Parameters
+

dirpath (str) – location to save the data labeler.

+
+
Returns
+

None

+
+
+
+
+
+set_labels(labels)
+

Sets the labels for the data labeler.

+
+
Parameters
+

labels (list or dict) – new labels in either encoding list or dict

+
+
Returns
+

None

+
+
+
+
+
+set_model(model)
+

Set the model for the data labeler

+
+
Parameters
+

model (base_model.BaseModel) – model to use within the data labeler

+
+
Returns
+

None

+
+
+
+
+
+set_params(params)
+

Allows the user to set parameters of pipeline components in the following format:

    params = dict(
        preprocessor=dict(...),
        model=dict(...),
        postprocessor=dict(...),
    )

where the key/value pairs for each pipeline component must match parameters that exist in their components.

+
+
Parameters
+

params (dict) –

dictionary containing a key for a given pipeline +component and its associated value of parameters as such:

+
+

dict(preprocessor=dict(…), model=dict(…), +postprocessor=dict(…))

+
+

+
+
Returns
+

None

+
+
+
+
+
+set_postprocessor(data_processor)
+

Set the data postprocessor for the data labeler

+
+
Parameters
+

data_processor (data_processing.BaseDataPostprocessor) – processor to set as the postprocessor

+
+
Returns
+

None

+
+
+
+
+
+set_preprocessor(data_processor)
+

Set the data preprocessor for the data labeler

+
+
Parameters
+

data_processor (data_processing.BaseDataPreprocessor) – processor to set as the preprocessor

+
+
Returns
+

None

+
+
+
+
+
+
+class dataprofiler.labelers.data_labelers.DataLabeler(labeler_type, dirpath=None, load_options=None, trainable=False)
+

Bases: object

+
+
+labeler_classes = {'structured': <class 'dataprofiler.labelers.data_labelers.StructuredDataLabeler'>, 'unstructured': <class 'dataprofiler.labelers.data_labelers.UnstructuredDataLabeler'>}
+
+
+
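A minimal sketch of how labeler_type selects an entry from labeler_classes above:

    from dataprofiler.labelers.data_labelers import DataLabeler

    structured = DataLabeler(labeler_type='structured')
    unstructured = DataLabeler(labeler_type='unstructured')

    print(type(structured).__name__)    # expected: StructuredDataLabeler
    print(type(unstructured).__name__)  # expected: UnstructuredDataLabeler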
+ +
+ +
+ +
+
\ No newline at end of file
diff --git a/docs/0.7.1/html/dataprofiler.labelers.data_processing.html b/docs/0.7.1/html/dataprofiler.labelers.data_processing.html
new file mode 100644 index 000000000..e2a98962a --- /dev/null +++ b/docs/0.7.1/html/dataprofiler.labelers.data_processing.html @@ -0,0 +1,1127 @@
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Data Processing

+
+
+class dataprofiler.labelers.data_processing.AutoSubRegistrationMeta(clsname, bases, attrs)
+

Bases: abc.ABCMeta

+
+
+mro()
+

Return a type’s method resolution order.

+
+
+
+register(subclass)
+

Register a virtual subclass of an ABC.

+

Returns the subclass, to allow usage as a class decorator.

+
+
+
+
+class dataprofiler.labelers.data_processing.BaseDataProcessor(**parameters)
+

Bases: object

+

Abstract Data processing class.

+
+
+processor_type = None
+
+
+
+classmethod get_class(class_name)
+
+
+
+abstract classmethod help()
+

Help function describing alterable parameters, input data formats +for preprocessors, and output data formats for postprocessors.

+
+
Returns
+

None

+
+
+
+
+
+get_parameters(param_list=None)
+

Returns a dict of parameters from the model given a list.

+
+
Parameters
+

param_list (list) – list of parameters to retrieve from the model.

+
+
Returns
+

dict of parameters

+
+
+
+
+
+set_params(**kwargs)
+

Given kwargs, set the parameters if they exist.

+
+
+
+abstract process(*args)
+

Data processing function.

+
+
+
+classmethod load_from_disk(dirpath)
+

Loads a data processor from a given path on disk

+
+
+
+classmethod load_from_library(name)
+

Loads a data processor from within the library

+
+
+
+save_to_disk(dirpath)
+

Saves a data processor to a path on disk.

+
+
+
+
+class dataprofiler.labelers.data_processing.BaseDataPreprocessor(**parameters)
+

Bases: dataprofiler.labelers.data_processing.BaseDataProcessor

+

Abstract Data preprocessing class.

+
+
+processor_type = 'preprocessor'
+
+
+
+abstract process(data, labels, label_mapping, batch_size)
+

Data preprocessing function.

+
+
+
+classmethod get_class(class_name)
+
+
+
+get_parameters(param_list=None)
+

Returns a dict of parameters from the model given a list.

+
+
Parameters
+

param_list (list) – list of parameters to retrieve from the model.

+
+
Returns
+

dict of parameters

+
+
+
+
+
+abstract classmethod help()
+

Help function describing alterable parameters, input data formats +for preprocessors, and output data formats for postprocessors.

+
+
Returns
+

None

+
+
+
+
+
+classmethod load_from_disk(dirpath)
+

Loads a data processor from a given path on disk

+
+
+
+classmethod load_from_library(name)
+

Loads a data processor from within the library

+
+
+
+save_to_disk(dirpath)
+

Saves a data processor to a path on disk.

+
+
+
+set_params(**kwargs)
+

Given kwargs, set the parameters if they exist.

+
+
+
+
+class dataprofiler.labelers.data_processing.BaseDataPostprocessor(**parameters)
+

Bases: dataprofiler.labelers.data_processing.BaseDataProcessor

+

Abstract Data postprocessing class.

+
+
+processor_type = 'postprocessor'
+
+
+
+abstract process(data, results, label_mapping)
+

Data postprocessing function.

+
+
+
+classmethod get_class(class_name)
+
+
+
+get_parameters(param_list=None)
+

Returns a dict of parameters from the model given a list.

+
+
Parameters
+

param_list (list) – list of parameters to retrieve from the model.

+
+
Returns
+

dict of parameters

+
+
+
+
+
+abstract classmethod help()
+

Help function describing alterable parameters, input data formats +for preprocessors, and output data formats for postprocessors.

+
+
Returns
+

None

+
+
+
+
+
+classmethod load_from_disk(dirpath)
+

Loads a data processor from a given path on disk

+
+
+
+classmethod load_from_library(name)
+

Loads a data processor from within the library

+
+
+
+save_to_disk(dirpath)
+

Saves a data processor to a path on disk.

+
+
+
+set_params(**kwargs)
+

Given kwargs, set the parameters if they exist.

+
+
+
+
+class dataprofiler.labelers.data_processing.DirectPassPreprocessor
+

Bases: dataprofiler.labelers.data_processing.BaseDataPreprocessor

+

Initialize the DirectPassPreprocessor class

+
+
+classmethod help()
+

Help function describing alterable parameters, input data formats +for preprocessors, and output data formats for postprocessors.

+
+
Returns
+

None

+
+
+
+
+
+process(data, labels=None, label_mapping=None, batch_size=None)
+

Data preprocessing function.

+
+
+
+classmethod get_class(class_name)
+
+
+
+get_parameters(param_list=None)
+

Returns a dict of parameters from the model given a list.

+
+
Parameters
+

param_list (list) – list of parameters to retrieve from the model.

+
+
Returns
+

dict of parameters

+
+
+
+
+
+classmethod load_from_disk(dirpath)
+

Loads a data processor from a given path on disk

+
+
+
+classmethod load_from_library(name)
+

Loads a data processor from within the library

+
+
+
+processor_type = 'preprocessor'
+
+
+
+save_to_disk(dirpath)
+

Saves a data processor to a path on disk.

+
+
+
+set_params(**kwargs)
+

Given kwargs, set the parameters if they exist.

+
+
+
+
+class dataprofiler.labelers.data_processing.CharPreprocessor(max_length=3400, default_label='UNKNOWN', pad_label='PAD', flatten_split=0, flatten_separator=' ', is_separate_at_max_len=False)
+

Bases: dataprofiler.labelers.data_processing.BaseDataPreprocessor

+

Initialize the CharPreprocessor class

+
+
Parameters
+
    +
  • max_length (int) – Maximum char length in a sample.

  • +
  • default_label (string (could be int, char, etc.)) – Key for label_mapping that is the default label

  • +
  • pad_label (string (could be int, char, etc.)) – Key for label_mapping that is the pad label

  • +
  • flatten_split (float) – approximate output of split between flattened and +non-flattened characters, value between [0, 1]. When the current +flattened split becomes more than the flatten_split value, any +leftover sample or subsequent samples will be non-flattened until +the current flattened split is below the flatten_split value

  • +
  • flatten_separator (str) – separator used to put between flattened +samples.

  • +
  • is_separate_at_max_len (bool) – if true, separates at max_length, +otherwise at nearest separator

  • +
+
+
+
+
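A minimal construction sketch using the parameters above (the values are illustrative, not recommended settings):

    from dataprofiler.labelers.data_processing import CharPreprocessor

    preprocessor = CharPreprocessor(
        max_length=3400,
        default_label='UNKNOWN',
        pad_label='PAD',
        flatten_split=0.5,            # aim for roughly half flattened samples
        flatten_separator=' ',
        is_separate_at_max_len=False,
    )
    print(preprocessor.get_parameters(['max_length', 'flatten_split']))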
+classmethod help()
+

Help function describing alterable parameters, input data formats +for preprocessors, and output data formats for postprocessors.

+
+
Returns
+

None

+
+
+
+
+
+process(data, labels=None, label_mapping=None, batch_size=32)
+

Flatten batches of data

+
+
Parameters
+
    +
  • data (numpy.ndarray) – List of strings to create embeddings for

  • +
  • labels (numpy.ndarray) – labels for each input character

  • +
  • label_mapping (Union[None, dict]) – maps labels to their encoded integers

  • +
  • batch_size (int) – Number of samples in the batch of data

  • +
+
+
Return batch_data
+

A dict containing samples of size batch_size

+
+
Rtype batch_data
+

dicts

+
+
+
+
+
+classmethod get_class(class_name)
+
+
+
+get_parameters(param_list=None)
+

Returns a dict of parameters from the model given a list.

+
+
Parameters
+

param_list (list) – list of parameters to retrieve from the model.

+
+
Returns
+

dict of parameters

+
+
+
+
+
+classmethod load_from_disk(dirpath)
+

Loads a data processor from a given path on disk

+
+
+
+classmethod load_from_library(name)
+

Loads a data processor from within the library

+
+
+
+processor_type = 'preprocessor'
+
+
+
+save_to_disk(dirpath)
+

Saves a data processor to a path on disk.

+
+
+
+set_params(**kwargs)
+

Given kwargs, set the parameters if they exist.

+
+
+
+
+class dataprofiler.labelers.data_processing.CharPostprocessor(default_label='UNKNOWN', pad_label='PAD', flatten_separator=' ', use_word_level_argmax=False, output_format='character_argmax', separators=(' ', ',', ';', "'", '"', ':', '\n', '\t', '.'), word_level_min_percent=0.75)
+

Bases: dataprofiler.labelers.data_processing.BaseDataPostprocessor

+

Initialize the CharPostprocessor class

+
+
Parameters
+
    +
  • default_label (string (could be int, char, etc.)) – Key for label_mapping that is the default label

  • +
  • pad_label (string (could be int, char, etc.)) – Key for label_mapping that is the pad label

  • +
  • flatten_separator (str) – separator used to put between flattened +samples.

  • +
  • use_word_level_argmax (bool) – whether to require the argmax value of +each character in a word to determine the word’s entity

  • +
  • output_format (str) – (character_argmax vs NER) where character_argmax +is a list of encodings for each character in the input text and NER +is in the dict format which specifies start,end,label for each +entity in a sentence

  • +
  • separators (tuple(str)) – list of characters to use for separating words within +the character predictions

  • +
  • word_level_min_percent (float) – threshold on generating dominant +word_level labeling

  • +
+
+
+
+
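A brief sketch of the two output_format choices described above:

    from dataprofiler.labelers.data_processing import CharPostprocessor

    # character-level argmax output (the default style)
    char_level = CharPostprocessor(output_format='character_argmax')

    # NER-style output with word-level argmax enabled
    ner_style = CharPostprocessor(output_format='NER', use_word_level_argmax=True)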
+classmethod help()
+

Help function describing alterable parameters, input data formats +for preprocessors, and output data formats for postprocessors.

+
+
Returns
+

None

+
+
+
+
+
+static convert_to_NER_format(predictions, label_mapping, default_label, pad_label)
+

Converts word level predictions to specified format

+
+
Parameters
+
    +
  • predictions (list) – predictions

  • +
  • label_mapping (dict) – labels and corresponding integers

  • +
  • default_label (str) – default label in label_mapping

  • +
  • pad_label (str) – pad label in label_mapping

  • +
+
+
Returns
+

formatted predictions

+
+
Return type
+

list

+
+
+
+
+
+static match_sentence_lengths(data, results, flatten_separator, inplace=True)
+

Converts the results from the model into the same ragged data shapes as +the original data.

+
+
Parameters
+
    +
  • data (numpy.ndarray) – original input data to the data labeler

  • +
  • results (dict) – dict of model character level predictions and confs

  • +
  • flatten_separator (str) – string which joins two samples together when flattening

  • +
  • inplace (bool) – flag to modify results in place

  • +
+
+
Returns
+

dict(pred=…) or dict(pred=…, conf=…)

+
+
+
+
+
+process(data, results, label_mapping)
+

Conducts the processing on the data given the predictions, +label_mapping, and default_label.

+
+
Parameters
+
    +
  • data (np.ndarray) – original input data to the data labeler

  • +
  • results (dict) – dict of model character level predictions and confs

  • +
  • label_mapping (dict) – labels and corresponding integers

  • +
+
+
Returns
+

dict of predictions and if they exist, confidences

+
+
+
+
+
+classmethod get_class(class_name)
+
+
+
+get_parameters(param_list=None)
+

Returns a dict of parameters from the model given a list.

+
+
Parameters
+

param_list (list) – list of parameters to retrieve from the model.

+
+
Returns
+

dict of parameters

+
+
+
+
+
+classmethod load_from_disk(dirpath)
+

Loads a data processor from a given path on disk

+
+
+
+classmethod load_from_library(name)
+

Loads a data processor from within the library

+
+
+
+processor_type = 'postprocessor'
+
+
+
+save_to_disk(dirpath)
+

Saves a data processor to a path on disk.

+
+
+
+set_params(**kwargs)
+

Given kwargs, set the parameters if they exist.

+
+
+
+
+class dataprofiler.labelers.data_processing.StructCharPreprocessor(max_length=3400, default_label='UNKNOWN', pad_label='PAD', flatten_separator='\x01\x01\x01\x01\x01', is_separate_at_max_len=False)
+

Bases: dataprofiler.labelers.data_processing.CharPreprocessor

+

Initialize the StructCharPreprocessor class

+
+
Parameters
+
    +
  • max_length (int) – Maximum char length in a sample.

  • +
  • default_label (string (could be int, char, etc.)) – Key for label_mapping that is the default label

  • +
  • pad_label (string (could be int, char, etc.)) – Key for label_mapping that is the pad label

  • +
  • flatten_separator (str) – separator used to put between flattened +samples.

  • +
  • is_separate_at_max_len (bool) – if true, separates at max_length, +otherwise at nearest separator

  • +
+
+
+
+
+classmethod help()
+

Help function describing alterable parameters, input data formats for preprocessors, and output data formats for postprocessors.

+
+
Returns
+

None

+
+
+
+
+
+get_parameters(param_list=None)
+

Returns a dict of parameters from the model given a list.

+
+
Parameters
+

param_list (list) – list of parameters to retrieve from the model.

+
+
Returns
+

dict of parameters

+
+
+
+
+
+convert_to_unstructured_format(data, labels)
+

Converts the list of data samples into the CharPreprocessor +required input data format.

+
+
Parameters
+
    +
  • data (numpy.ndarray) – list of strings

  • +
  • labels (list) – labels for each input character

  • +
+
+
Returns
+

data in the following format:

    text="<SAMPLE><SEPARATOR><SAMPLE>...",
    entities=[(start=<INT>, end=<INT>, label="<LABEL>"),
              ...(num_samples in data)]

+
+

+
+
+
+
+
+process(data, labels=None, label_mapping=None, batch_size=32)
+

Process structured data for being processed by the +CharacterLevelCnnModel.

+
+
Parameters
+
    +
  • data (numpy.ndarray) – List of strings to create embeddings for

  • +
  • labels (numpy.ndarray) – labels for each input character

  • +
  • label_mapping (Union[dict, None]) – maps labels to their encoded integers

  • +
  • batch_size (int) – Number of samples in the batch of data

  • +
+
+
Return batch_data
+

A dict containing samples of size batch_size

+
+
Rtype batch_data
+

dict

+
+
+
+
+
+classmethod get_class(class_name)
+
+
+
+classmethod load_from_disk(dirpath)
+

Loads a data processor from a given path on disk

+
+
+
+classmethod load_from_library(name)
+

Loads a data processor from within the library

+
+
+
+processor_type = 'preprocessor'
+
+
+
+save_to_disk(dirpath)
+

Saves a data processor to a path on disk.

+
+
+
+set_params(**kwargs)
+

Given kwargs, set the parameters if they exist.

+
+
+
+
+class dataprofiler.labelers.data_processing.StructCharPostprocessor(default_label='UNKNOWN', pad_label='PAD', flatten_separator='\x01\x01\x01\x01\x01', random_state=None)
+

Bases: dataprofiler.labelers.data_processing.BaseDataPostprocessor

+

Initialize the StructCharPostprocessor class

+
+
Parameters
+
    +
  • default_label (str) – Key for label_mapping that is the default label

  • +
  • pad_label (str) – Key for label_mapping that is the pad label

  • +
  • flatten_separator (str) – separator used to put between flattened +samples.

  • +
  • random_state (random.Random) – random state setting to be used for randomly +selecting a prediction when two labels have equal opportunity for +a given sample.

  • +
+
+
+
+
+classmethod help()
+

Help function describing alterable parameters, input data formats +for preprocessors, and output data formats for postprocessors.

+
+
Returns
+

None

+
+
+
+
+
+static match_sentence_lengths(data, results, flatten_separator, inplace=True)
+

Converts the results from the model into the same ragged data shapes as +the original data.

+
+
Parameters
+
    +
  • data (np.ndarray) – original input data to the data labeler

  • +
  • results (dict) – dict of model character level predictions and confs

  • +
  • flatten_separator (str) – string which joins two samples together when flattening

  • +
  • inplace (bool) – flag to modify results in place

  • +
+
+
Returns
+

dict(pred=…) or dict(pred=…, conf=…)

+
+
+
+
+
+convert_to_structured_analysis(sentences, results, label_mapping, default_label, pad_label)
+

Converts unstructured results to a structured column analysis, assuming the column was flattened into a single sample. This takes the mode of all character predictions except for the separator labels. In the case of a tie, choose anything but background; otherwise randomly choose between the remaining labels.

+
+
Parameters
+
    +
  • sentences (list(str)) – samples which were predicted upon

  • +
  • results (dict) – character predictions for each sample return from model

  • +
  • label_mapping (dict) – maps labels to their encoded integers

  • +
  • default_label (str) – Key for label_mapping that is the default label

  • +
  • pad_label (str) – Key for label_mapping that is the pad label

  • +
+
+
Returns
+

prediction value for a single column

+
+
+
+
+
+process(data, results, label_mapping)
+

Postprocessing of CharacterLevelCnnModel results when given structured +data processed by StructCharPreprocessor.

+
+
Parameters
+
    +
  • data (Union[numpy.ndarray, pandas.DataFrame]) – original input data to the data labeler

  • +
  • results (dict) – dict of model character level predictions and confs

  • +
  • label_mapping (dict) – maps labels to their encoded integers

  • +
+
+
Returns
+

dict of predictions and if they exist, confidences

+
+
Return type
+

dict

+
+
+
+
+
+classmethod get_class(class_name)
+
+
+
+get_parameters(param_list=None)
+

Returns a dict of parameters from the model given a list.

+
+
Parameters
+

param_list (list) – list of parameters to retrieve from the model.

+
+
Returns
+

dict of parameters

+
+
+
+
+
+classmethod load_from_disk(dirpath)
+

Loads a data processor from a given path on disk

+
+
+
+classmethod load_from_library(name)
+

Loads a data processor from within the library

+
+
+
+processor_type = 'postprocessor'
+
+
+
+save_to_disk(dirpath)
+

Saves a data processor to a path on disk.

+
+
+
+set_params(**kwargs)
+

Given kwargs, set the parameters if they exist.

+
+
+
+
+class dataprofiler.labelers.data_processing.RegexPostProcessor(aggregation_func='split', priority_order=None, random_state=None)
+

Bases: dataprofiler.labelers.data_processing.BaseDataPostprocessor

+

Initialize the RegexPostProcessor class

+
+
Parameters
+
    +
  • aggregation_func (str) – aggregation function to apply to regex model +output (split, random, priority)

  • +
  • priority_order (Union[list, numpy.ndarray]) – if priority is set as the aggregation function, +the order in which entities are given priority must be set

  • +
  • random_state (random.Random) – random state setting to be used for randomly +selecting a prediction when two labels have equal opportunity for +a given sample.

  • +
+
+
+
+
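A sketch of the three aggregation_func options described above (the entity names in priority_order are illustrative only):

    from dataprofiler.labelers.data_processing import RegexPostProcessor

    split_votes = RegexPostProcessor(aggregation_func='split')
    random_pick = RegexPostProcessor(aggregation_func='random')
    by_priority = RegexPostProcessor(
        aggregation_func='priority',
        priority_order=['SSN', 'PHONE_NUMBER', 'UNKNOWN'],  # illustrative labels
    )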
+classmethod help()
+

Help function describing alterable parameters, input data formats +for preprocessors, and output data formats for postprocessors.

+
+
Returns
+

None

+
+
+
+
+
+static priority_prediction(results, entity_priority_order)
+

Aggregation function using priority of regex to give entity +determination.

+
+
Parameters
+
    +
  • results (dict) – regex from model in format: dict(pred=…, conf=…)

  • +
  • entity_priority_order (np.ndarray) – list of entity priorities (lowest has +higher priority)

  • +
+
+
Returns
+

aggregated predictions

+
+
+
+
+
+static split_prediction(results)
+

Splits the prediction across votes.

Parameters

results (dict) – regex from model in format: dict(pred=…, conf=…)

Returns

aggregated predictions

+
+
+
+process(data, labels=None, label_mapping=None, batch_size=None)
+

Data preprocessing function.

+
+
+
+classmethod get_class(class_name)
+
+
+
+get_parameters(param_list=None)
+

Returns a dict of parameters from the model given a list.

+
+
Parameters
+

param_list (list) – list of parameters to retrieve from the model.

+
+
Returns
+

dict of parameters

+
+
+
+
+
+classmethod load_from_disk(dirpath)
+

Loads a data processor from a given path on disk

+
+
+
+classmethod load_from_library(name)
+

Loads a data processor from within the library

+
+
+
+processor_type = 'postprocessor'
+
+
+
+save_to_disk(dirpath)
+

Saves a data processor to a path on disk.

+
+
+
+set_params(**kwargs)
+

Given kwargs, set the parameters if they exist.

+
+
+
+ +
+ +
+ +
+
\ No newline at end of file
diff --git a/docs/0.7.1/html/dataprofiler.labelers.html b/docs/0.7.1/html/dataprofiler.labelers.html
new file mode 100644 index 000000000..48fdaea24 --- /dev/null +++ b/docs/0.7.1/html/dataprofiler.labelers.html @@ -0,0 +1,346 @@
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Labelers

+
+

Modules

+
+
+ +

The following will list the built-in models, processors, and data labelers.

+
+
Models:

  1. CharacterLevelCnnModel - character classification of text.

  2. RegexModel - character classification of text.

Processors:

Preprocessors

  1. CharPreprocessor

  2. StructCharPreprocessor

  3. DirectPassPreprocessor

Postprocessors

  1. CharPostprocessor

  2. StructCharPostprocessor

  3. RegexPostProcessor

Data Labelers:

Classes

  1. UnstructuredDataLabeler

  2. StructuredDataLabeler

Files to load from disk using BaseDataLabeler.load_from_library(<NAME>)

  1. unstructured_model

  2. structured_model

  3. regex_model
+
+
+
+
+
+
+ +
+ +
+ +
+
\ No newline at end of file
diff --git a/docs/0.7.1/html/dataprofiler.labelers.labeler_utils.html b/docs/0.7.1/html/dataprofiler.labelers.labeler_utils.html
new file mode 100644 index 000000000..7ea485b57 --- /dev/null +++ b/docs/0.7.1/html/dataprofiler.labelers.labeler_utils.html @@ -0,0 +1,317 @@
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Labeler Utils

+
+
+dataprofiler.labelers.labeler_utils.f1_report_dict_to_str(f1_report, label_names)
+

Returns the report string from the f1_report dict.

+
+
Example Output:

                  precision    recall  f1-score   support

         class 0       0.00      0.00      0.00         1
         class 1       1.00      0.67      0.80         3

       micro avg       0.67      0.50      0.57         4
       macro avg       0.50      0.33      0.40         4
    weighted avg       0.75      0.50      0.60         4

Note: this is generally taken from the classification_report function inside sklearn.

Parameters

  • f1_report (dict) – f1 report dictionary from sklearn

  • label_names (list(str)) – names of labels included in the report

Returns

string representing the f1_report printout

Return type

str

+
+
+
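A usage sketch pairing this helper with sklearn's dict-style classification report (labels and values are illustrative):

    from sklearn.metrics import classification_report
    from dataprofiler.labelers.labeler_utils import f1_report_dict_to_str

    y_true = [0, 1, 1, 1]
    y_pred = [1, 1, 0, 1]
    report = classification_report(
        y_true, y_pred, target_names=['class 0', 'class 1'], output_dict=True)

    print(f1_report_dict_to_str(report, label_names=['class 0', 'class 1']))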
+dataprofiler.labelers.labeler_utils.evaluate_accuracy(predicted_entities_in_index, true_entities_in_index, num_labels, entity_rev_dict, verbose=True, omitted_labels=('PAD', 'UNKNOWN'), confusion_matrix_file=None)
+

Evaluate the accuracy from comparing the predicted labels with true labels

+
+
Parameters
+
    +
  • predicted_entities_in_index (list(array(int))) – predicted encoded labels for input +sentences

  • +
  • true_entities_in_index (list(array(int))) – true encoded labels for input sentences

  • +
  • entity_rev_dict (dict([index, entity])) – dictionary to convert indices to entities

  • +
  • verbose (boolean) – print additional information for debugging

  • +
  • omitted_labels (list() of text labels) – labels to omit from the accuracy evaluation

  • +
  • confusion_matrix_file (str) – File name (and dir) for confusion matrix

  • +
+
+
+

Returns

f1-score

Return type

float

+
+
+ +
+ +
+ +
+
\ No newline at end of file
diff --git a/docs/0.7.1/html/dataprofiler.labelers.regex_model.html b/docs/0.7.1/html/dataprofiler.labelers.regex_model.html
new file mode 100644 index 000000000..72ad21074 --- /dev/null +++ b/docs/0.7.1/html/dataprofiler.labelers.regex_model.html @@ -0,0 +1,477 @@
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Regex Model

+
+
+class dataprofiler.labelers.regex_model.RegexModel(label_mapping=None, parameters=None)
+

Bases: dataprofiler.labelers.base_model.BaseModel

+

Regex Model Initializer.

+
+
Example regex_patterns:

    regex_patterns = {
        "LABEL_1": [
            "LABEL_1_pattern_1",
            "LABEL_1_pattern_2",
            ...
        ],
        "LABEL_2": [
            "LABEL_2_pattern_1",
            "LABEL_2_pattern_2",
            ...
        ],
    }

Example encapsulators:

    encapsulators = {
        'start': r'(?<![\w.\$\%\-])',
        'end': r'(?:(?=(\b|[ ]))|(?=[^\w\%\$]([^\w]|$))|$)',
    }

+
+
+
+
Parameters
+
    +
  • label_mapping (dict) – maps labels to their encoded integers

  • +
  • parameters (dict) –

    Contains all the appropriate parameters for the model. +Possible parameters are:

    +
    +

    max_length, max_num_chars, dim_embed

    +
    +

  • +
+
+
Returns
+

None

+
+
+
+
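A minimal sketch of building and applying a RegexModel; the label names, pattern, and the 'regex_patterns' / 'default_label' parameter keys follow the example structure above and should be treated as assumptions rather than a definitive parameter list:

    from dataprofiler.labelers.regex_model import RegexModel

    label_mapping = {'PAD': 0, 'UNKNOWN': 1, 'SSN': 2}
    parameters = {
        'regex_patterns': {'SSN': [r'\d{3}-\d{2}-\d{4}']},  # assumed parameter key
        'default_label': 'UNKNOWN',                          # assumed parameter key
    }

    model = RegexModel(label_mapping=label_mapping, parameters=parameters)
    output = model.predict(['ssn: 123-45-6789'], show_confidences=False)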
+reset_weights()
+

Reset the weights of the model.

+
+
Returns
+

None

+
+
+
+
+
+predict(data, batch_size=None, show_confidences=False, verbose=True)
+

Applies the regex patterns (within regex_model) to the input_string, creating predictions for all matching patterns. Each pattern has an associated entity and the predictions of each character within the string are given a True or False identification for each entity. All characters not identified by ANY of the regex patterns in the pattern_dict are considered background characters and are replaced with the default_label value.

+
+
Parameters
+
    +
  • data (iterator) – list of strings to predict upon

  • +
  • batch_size (N/A) – does not impact this model and should be fixed to not +be required.

  • +
  • show_confidences – whether user wants prediction confidences

  • +
  • verbose (bool) – Flag to determine whether to print status or not

  • +
+
+
Returns
+

char level predictions and confidences

+
+
Return type
+

dict

+
+
+
+
+
+classmethod load_from_disk(dirpath)
+

Loads whole model from disk with weights

+
+
Parameters
+

dirpath (str) – directory path where you want to load the model from

+
+
Returns
+

None

+
+
+
+
+
+save_to_disk(dirpath)
+

Saves whole model to disk with weights.

+
+
Parameters
+

dirpath (str) – directory path where you want to save the model to

+
+
Returns
+

None

+
+
+
+
+
+add_label(label, same_as=None)
+

Adds a label to the data labeler.

+
+
Parameters
+
    +
  • label (str) – new label being added to the data labeler

  • +
  • same_as (str) – label to have the same encoding index as for multi-label +to single encoding index.

  • +
+
+
Returns
+

None

+
+
+
+
+
+classmethod get_class(class_name)
+
+
+
+get_parameters(param_list=None)
+

Returns a dict of parameters from the model given a list.

Parameters

param_list (list) – list of parameters to retrieve from the model.

Returns

dict of parameters

+
+
+
+classmethod help()
+

Help function describing alterable parameters.

+
+
Returns
+

None

+
+
+
+
+
+property label_mapping
+

mapping of labels to their encoded values

+
+
Type
+

return

+
+
+
+
+
+property labels
+

Retrieves the label.

Returns

list of labels

+
+
+
+property num_labels
+
+
+
+requires_zero_mapping = False
+
+
+
+property reverse_label_mapping
+

Reversed order of current labels, useful for when needed to +extract Labels via indices

+
+
Type
+

return

+
+
+
+
+
+set_label_mapping(label_mapping)
+

Sets the labels for the model

+
+
Parameters
+

label_mapping (Union[list, dict]) – label mapping of the model or list of labels to be +converted into the label mapping

+
+
Returns
+

None

+
+
+
+
+
+set_params(**kwargs)
+

Given kwargs, set the parameters if they exist.

+
+
+
+ +
+ +
+ +
+
\ No newline at end of file
diff --git a/docs/0.7.1/html/dataprofiler.profilers.base_column_profilers.html b/docs/0.7.1/html/dataprofiler.profilers.base_column_profilers.html
new file mode 100644 index 000000000..1593a6636 --- /dev/null +++ b/docs/0.7.1/html/dataprofiler.profilers.base_column_profilers.html @@ -0,0 +1,367 @@
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Base Column Profilers

+

coding=utf-8

+

Profiles the data.

+
+
+class dataprofiler.profilers.base_column_profilers.BaseColumnProfiler(name)
+

Bases: object

+

Abstract class for profiling a column of data.

+

Initialization of base class properties for the subclass.

+
+
Parameters
+

name (String) – Name of the dataset

+
+
+
+
+col_type = None
+
+
+
+diff(other_profile, options=None)
+

Finds the differences for columns.

+
+
Parameters
+

other_profile (BaseColumnProfiler) – profile to find the difference with

+
+
Returns
+

the stat differences

+
+
Return type
+

dict

+
+
+
+
+
+abstract update(df_series)
+

Private abstract method for updating the profile.

+
+
Parameters
+

df_series (Pandas Dataframe) – Data to profile.

+
+
+
+
+
+abstract property profile
+

Property for profile. Returns the profile of the column.

+
+
+
+
+class dataprofiler.profilers.base_column_profilers.BaseColumnPrimitiveTypeProfiler(name)
+

Bases: dataprofiler.profilers.base_column_profilers.BaseColumnProfiler

+

Abstract class for profiling the primitive data type for a column of data.

+

Initialization of base class properties for the subclass.

+
+
Parameters
+

name (String) – Name of the data

+
+
+
+
+col_type = None
+
+
+
+diff(other_profile, options=None)
+

Finds the differences for columns.

+
+
Parameters
+

other_profile (BaseColumnProfiler) – profile to find the difference with

+
+
Returns
+

the stat differences

+
+
Return type
+

dict

+
+
+
+
+
+abstract property profile
+

Property for profile. Returns the profile of the column.

+
+
+
+abstract update(df_series)
+

Private abstract method for updating the profile.

+
+
Parameters
+

df_series (Pandas Dataframe) – Data to profile.

+
+
+
+
+
+ +
+ +
+ +
+
\ No newline at end of file
diff --git a/docs/0.7.1/html/dataprofiler.profilers.categorical_column_profile.html b/docs/0.7.1/html/dataprofiler.profilers.categorical_column_profile.html
new file mode 100644 index 000000000..93d5bd816 --- /dev/null +++ b/docs/0.7.1/html/dataprofiler.profilers.categorical_column_profile.html @@ -0,0 +1,373 @@
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Categorical Column Profile

+
+
+class dataprofiler.profilers.categorical_column_profile.CategoricalColumn(name, options=None)
+

Bases: dataprofiler.profilers.base_column_profilers.BaseColumnProfiler

+

Categorical column profile subclass of BaseColumnProfiler. Represents a column in the dataset which is a categorical column.

+

Initialization of column base properties and itself.

+
+
Parameters
+

name (String) – Name of data

+
+
+
+
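A direct usage sketch (normally this profile is built by the profiler; sample values are illustrative):

    import pandas as pd
    from dataprofiler.profilers.categorical_column_profile import CategoricalColumn

    profile = CategoricalColumn(name='color')
    profile.update(pd.Series(['red', 'blue', 'red', 'green']))

    print(profile.categories)     # unique categories seen so far
    print(profile.unique_ratio)   # unique categories / sample size
    print(profile.is_match)       # True when the column looks categorical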
+type = 'category'
+
+
+
+diff(other_profile, options=None)
+

Finds the differences for CategoricalColumns.

+
+
Parameters
+

other_profile (CategoricalColumn) – profile to find the difference with

+
+
Returns
+

the CategoricalColumn differences

+
+
Return type
+

dict

+
+
+
+
+
+property profile
+

Property for profile. Returns the profile of the column. +For categorical_count, it will display the top k categories most +frequently occurred in descending order.

+
+
+
+property categories
+

Property for categories.

+
+
+
+property unique_ratio
+

Property for unique_ratio. Returns ratio of unique +categories to sample_size

+
+
+
+property is_match
+

Property for is_match. Returns true if column is categorical.

+
+
+
+update(df_series)
+

Updates the column profile.

+
+
Parameters
+

df_series (pandas.core.series.Series) – Data to profile.

+
+
Returns
+

None

+
+
+
+
+
+property gini_impurity
+

Property for Gini Impurity. Gini Impurity is a way to calculate +likelihood of an incorrect classification of a new instance of +a random variable.

+

G = Σ(i=1 to J) P(i) * (1 - P(i)), where i indexes the J category classes. We traverse the categories and compute this over the column.

+
+
Returns
+

None or Gini Impurity probability

+
+
+
+
+
+col_type = None
+
+
+
+property unalikeability
+

Property for unalikeability. Unalikeability checks for “how often observations differ from one another”. Reference: Perry, M. and Kader, G. Variation as Unalikeability. Teaching Statistics, Vol. 27, No. 2 (2005), pp. 58-60.

U = Σ(i=1,n) Σ(j=1,n) c_ij / (n² - n), where c_ij = 1 if i != j and 0 if i = j

Returns

None or the unalikeability probability

+
+
+
+
+
+ +
+ +
+ +
+
\ No newline at end of file
diff --git a/docs/0.7.1/html/dataprofiler.profilers.column_profile_compilers.html b/docs/0.7.1/html/dataprofiler.profilers.column_profile_compilers.html
new file mode 100644 index 000000000..e419878e5 --- /dev/null +++ b/docs/0.7.1/html/dataprofiler.profilers.column_profile_compilers.html @@ -0,0 +1,510 @@
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Column Profile Compilers

+
+
+class dataprofiler.profilers.column_profile_compilers.BaseCompiler(df_series=None, options=None, pool=None)
+

Bases: object

+
+
+abstract property profile
+
+
+
+diff(other, options=None)
+

Finds the difference between 2 compilers and returns the report

+
+
Parameters
+

other (BaseCompiler) – profile compiler finding the difference with this one.

+
+
Returns
+

difference of the profiles

+
+
Return type
+

dict

+
+
+
+
+
+update_profile(df_series, pool=None)
+

Updates the profiles from the data frames

+
+
Parameters
+
    +
  • df_series (pandas.core.series.Series) – a given column, assume df_series in str

  • +
  • pool (multiprocessing.Pool) – pool to utilized for multiprocessing

  • +
+
+
Returns
+

Self

+
+
Return type
+

BaseCompiler

+
+
+
+
+
+
+class dataprofiler.profilers.column_profile_compilers.ColumnPrimitiveTypeProfileCompiler(df_series=None, options=None, pool=None)
+

Bases: dataprofiler.profilers.column_profile_compilers.BaseCompiler

+
+
+property profile
+
+
+
+property selected_data_type
+

Finds the selected data_type in a primitive compiler

+
+
Returns
+

name of the selected data type

+
+
Return type
+

str

+
+
+
+
+
+diff(other, options=None)
+

Finds the difference between 2 compilers and returns the report

+
+
Parameters
+

other (ColumnPrimitiveTypeProfileCompiler) – profile compiler finding the difference with this one.

+
+
Returns
+

difference of the profiles

+
+
Return type
+

dict

+
+
+
+
+
+update_profile(df_series, pool=None)
+

Updates the profiles from the data frames

+
+
Parameters
+
    +
  • df_series (pandas.core.series.Series) – a given column, assume df_series in str

  • +
  • pool (multiprocessing.Pool) – pool to utilized for multiprocessing

  • +
+
+
Returns
+

Self

+
+
Return type
+

BaseCompiler

+
+
+
+
+
+
+class dataprofiler.profilers.column_profile_compilers.ColumnStatsProfileCompiler(df_series=None, options=None, pool=None)
+

Bases: dataprofiler.profilers.column_profile_compilers.BaseCompiler

+
+
+property profile
+
+
+
+diff(other, options=None)
+

Finds the difference between 2 compilers and returns the report

+
+
Parameters
+

other (ColumnStatsProfileCompiler) – profile compiler finding the difference with this one.

+
+
Returns
+

difference of the profiles

+
+
Return type
+

dict

+
+
+
+
+
+update_profile(df_series, pool=None)
+

Updates the profiles from the data frames

+
+
Parameters
+
    +
  • df_series (pandas.core.series.Series) – a given column, assume df_series in str

  • +
  • pool (multiprocessing.Pool) – pool to utilized for multiprocessing

  • +
+
+
Returns
+

Self

+
+
Return type
+

BaseCompiler

+
+
+
+
+
+
+class dataprofiler.profilers.column_profile_compilers.ColumnDataLabelerCompiler(df_series=None, options=None, pool=None)
+

Bases: dataprofiler.profilers.column_profile_compilers.BaseCompiler

+
+
+property profile
+
+
+
+diff(other, options=None)
+

Finds the difference between 2 compilers and returns the report

+
+
Parameters
+
    +
  • other (ColumnDataLabelerCompiler) – profile compiler finding the difference with this one.

  • +
  • options (dict) – options to change results of the difference

  • +
+
+
Returns
+

difference of the profiles

+
+
Return type
+

dict

+
+
+
+
+
+update_profile(df_series, pool=None)
+

Updates the profiles from the data frames

+
+
Parameters
+
    +
  • df_series (pandas.core.series.Series) – a given column, assume df_series in str

  • +
  • pool (multiprocessing.Pool) – pool to utilized for multiprocessing

  • +
+
+
Returns
+

Self

+
+
Return type
+

BaseCompiler

+
+
+
+
+
+
+class dataprofiler.profilers.column_profile_compilers.UnstructuredCompiler(df_series=None, options=None, pool=None)
+

Bases: dataprofiler.profilers.column_profile_compilers.BaseCompiler

+
+
+property profile
+
+
+
+diff(other, options=None)
+

Finds the difference between 2 compilers and returns the report

+
+
Parameters
+
    +
  • other (UnstructuredCompiler) – profile compiler finding the difference with this one.

  • +
  • options (dict) – options to impact the results of the diff

  • +
+
+
Returns
+

difference of the profiles

+
+
Return type
+

dict

+
+
+
+
+
+update_profile(df_series, pool=None)
+

Updates the profiles from the data frames

+
+
Parameters
+
    +
  • df_series (pandas.core.series.Series) – a given column, assume df_series in str

  • +
  • pool (multiprocessing.Pool) – pool to utilized for multiprocessing

  • +
+
+
Returns
+

Self

+
+
Return type
+

BaseCompiler

+
+
+
+
+
+ +
+ +
+ +
+
\ No newline at end of file
diff --git a/docs/0.7.1/html/dataprofiler.profilers.data_labeler_column_profile.html b/docs/0.7.1/html/dataprofiler.profilers.data_labeler_column_profile.html
new file mode 100644 index 000000000..b75f788c1 --- /dev/null +++ b/docs/0.7.1/html/dataprofiler.profilers.data_labeler_column_profile.html @@ -0,0 +1,359 @@
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Data Labeler Column Profile

+
+
+class dataprofiler.profilers.data_labeler_column_profile.DataLabelerColumn(name, options=None)
+

Bases: dataprofiler.profilers.base_column_profilers.BaseColumnProfiler

+

Initialization of Data Label profiling for structured datasets.

+
+
Parameters
+
    +
  • name (String) – name of column being profiled

  • +
  • options (DataLabelerOptions) – Options for the data labeler column

  • +
+
+
+
+
+type = 'data_labeler'
+
+
+
+static assert_equal_conditions(data_labeler, data_labeler2)
+

Ensures data labelers have the same values. Raises error otherwise.

+
+
Parameters
+
+
+
Returns
+

None

+
+
+
+
+
+property data_label
+

Returns the data labels which best fit the data it has seen based on +the DataLabeler used. Data labels must be within the minimum probability +differential of the top predicted value. If nothing is more than +minimum top label value, it says it could not determine the data label.

+
+
+
+property avg_predictions
+

Averages all sample predictions for each data label.

+
+
+
+property label_representation
+

Representation of label found within the dataset based on ranked voting. +When top_k=1, this is simply the distribution of data labels found +within the dataset.

+
+
+
+property profile
+

Property for profile. Returns the profile of the column.

+
+
+
+col_type = None
+
+
+
+diff(other_profile, options=None)
+

Generates the differences between the orders of two DataLabeler columns

+
+
Returns
+

Dict containing the differences between orders in their appropriate output formats

Return type

dict

+
+
+
+update(df_series)
+

Updates the column profile.

+
+
Parameters
+

df_series (pandas.core.series.Series) – df series

+
+
Returns
+

None

+
+
+
+
+
+ +
+ +
+ +
+
\ No newline at end of file
diff --git a/docs/0.7.1/html/dataprofiler.profilers.datetime_column_profile.html b/docs/0.7.1/html/dataprofiler.profilers.datetime_column_profile.html
new file mode 100644 index 000000000..2ae8381cc --- /dev/null +++ b/docs/0.7.1/html/dataprofiler.profilers.datetime_column_profile.html @@ -0,0 +1,338 @@
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Datetime Column Profile

+
+
+class dataprofiler.profilers.datetime_column_profile.DateTimeColumn(name, options=None)
+

Bases: dataprofiler.profilers.base_column_profilers.BaseColumnPrimitiveTypeProfiler

+

Datetime column profile subclass of BaseColumnProfiler. Represents a column in the dataset which is a datetime column.

+

Initialization of column base properties and itself.

+
+
Parameters
+
    +
  • name (String) – Name of the data

  • +
  • options (DateTimeOptions) – Options for the datetime column

  • +
+
+
+
+
+type = 'datetime'
+
+
+
+property profile
+

Property for profile. Returns the profile of the column.

+
+
+
+property data_type_ratio
+

Calculates the ratio of samples which match this data type.

+
+
Returns
+

ratio of data type

+
+
Return type
+

float

+
+
+
+
+
+diff(other_profile, options=None)
+

Generates the differences between max, min, and formats of two DateTime columns

+
+
Returns
+

Dict containing the differences between max, min, and format in their appropriate output formats

Return type

dict

+
+
+
+update(df_series)
+

Updates the column profile.

+
+
Parameters
+

df_series (pandas.core.series.Series) – df series

+
+
Returns
+

None

+
+
+
+
+
+col_type = None
+
+
+
+ +
+ +
+ +
+
\ No newline at end of file
diff --git a/docs/0.7.1/html/dataprofiler.profilers.float_column_profile.html b/docs/0.7.1/html/dataprofiler.profilers.float_column_profile.html
new file mode 100644 index 000000000..9840db7fd --- /dev/null +++ b/docs/0.7.1/html/dataprofiler.profilers.float_column_profile.html @@ -0,0 +1,421 @@
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Float Column Profile

+
+
+class dataprofiler.profilers.float_column_profile.FloatColumn(name, options=None)
+

Bases: dataprofiler.profilers.numerical_column_stats.NumericStatsMixin, dataprofiler.profilers.base_column_profilers.BaseColumnPrimitiveTypeProfiler

+

Float column profile mixin of numerical stats. Represents a column in the dataset which is a float column.

+

Initialization of column base properties and itself.

Parameters

  • name (String) – Name of the data

  • options (FloatOptions) – Options for the float column

+
+
+type = 'float'
+
+
+
+diff(other_profile, options=None)
+

Finds the differences for FloatColumns.

+
+
Parameters
+

other_profile (FloatColumn) – profile to find the difference with

+
+
Returns
+

the FloatColumn differences

+
+
Return type
+

dict

+
+
+
+
+
+property profile
+

Property for profile. Returns the profile of the column.

+
+
+
+property precision
+

Property reporting statistics on the significant figures of each element in the data.

Returns

Precision statistics

Return type

dict

+
+
+
+property data_type_ratio
+

Calculates the ratio of samples which match this data type.

Returns

ratio of data type

Return type

float

+
+
+
+col_type = None
+
+
+
+static is_float(x)
+

For “0.80” this function returns True +For “1.00” this function returns True +For “1” this function returns True

+
+
Parameters
+

x (str) – string to test

+
+
Returns
+

if is float or not

+
+
Return type
+

bool

+
+
+
+
+
+static is_int(x)
+

For “0.80” This function returns False +For “1.00” This function returns True +For “1” this function returns True

+
+
Parameters
+

x (str) – string to test

+
+
Returns
+

if is integer or not

+
+
Return type
+

bool

+
+
+
+
+
+property kurtosis
+
+
+
+property mean
+
+
+
+property mode
+

Finds an estimate for the mode(s) of the data.

+
+
Returns
+

the mode(s) of the data

+
+
Return type
+

list(float)

+
+
+
+
+
+static np_type_to_type(val)
+

Converts numpy variables to base python type variables

+
+
Parameters
+

val (numpy type or base type) – value to check & change

+
+
Return val
+

base python type

+
+
Rtype val
+

int or float

+
+
+
+
+
+property skewness
+
+
+
+property stddev
+
+
+
+update(df_series)
+

Updates the column profile.

Parameters

df_series (pandas.core.series.Series) – df series

Returns

None

+
+
+
+property variance
+
+
+
+ +
+ +
+ +
+
\ No newline at end of file
diff --git a/docs/0.7.1/html/dataprofiler.profilers.helpers.html b/docs/0.7.1/html/dataprofiler.profilers.helpers.html
new file mode 100644 index 000000000..afbd21526 --- /dev/null +++ b/docs/0.7.1/html/dataprofiler.profilers.helpers.html @@ -0,0 +1,302 @@
Helpers - Data Profiler v0.7.1
+
+
+ +
+ +
+ +
+
+ +
+
+ + +
+ +
+
\ No newline at end of file
diff --git a/docs/0.7.1/html/dataprofiler.profilers.helpers.report_helpers.html b/docs/0.7.1/html/dataprofiler.profilers.helpers.report_helpers.html
new file mode 100644 index 000000000..ee9816796 --- /dev/null +++ b/docs/0.7.1/html/dataprofiler.profilers.helpers.report_helpers.html @@ -0,0 +1,296 @@
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Report Helpers

+
+
+dataprofiler.profilers.helpers.report_helpers.calculate_quantiles(num_quantile_groups, quantiles)
+
+
+
+dataprofiler.profilers.helpers.report_helpers.flat_dict(od, separator='_', key='')
+

Function to flatten a nested dictionary. Each level is collapsed and joined with the specified separator.

+
+
Parameters
+
    +
  • od (dict) – dictionary or dictionary-like object

  • +
  • separator (str) – character(s) joining successive levels

  • +
  • key (str) – concatenated keys

  • +
+
+
Returns
+

unnested dictionary

+
+
Return type
+

dict

+
+
+
+
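A small sketch of the flattening behaviour (key ordering may differ):

    from dataprofiler.profilers.helpers.report_helpers import flat_dict

    nested = {'statistics': {'min': 1, 'max': 9}, 'order': 'ascending'}
    print(flat_dict(nested))
    # expected roughly: {'statistics_min': 1, 'statistics_max': 9, 'order': 'ascending'}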
+ +
+ +
+ +
+
\ No newline at end of file
diff --git a/docs/0.7.1/html/dataprofiler.profilers.histogram_utils.html b/docs/0.7.1/html/dataprofiler.profilers.histogram_utils.html
new file mode 100644 index 000000000..73708d28f --- /dev/null +++ b/docs/0.7.1/html/dataprofiler.profilers.histogram_utils.html @@ -0,0 +1,273 @@
Histogram Utils - Data Profiler v0.7.1
+
+
+ +
+ +
+ +
+
+ +
+
+ + +
+ +
+
\ No newline at end of file
diff --git a/docs/0.7.1/html/dataprofiler.profilers.html b/docs/0.7.1/html/dataprofiler.profilers.html
new file mode 100644 index 000000000..5afa97e2b --- /dev/null +++ b/docs/0.7.1/html/dataprofiler.profilers.html @@ -0,0 +1,326 @@
Profilers - Data Profiler v0.7.1
+
+
+ +
+ +
+ +
+
+ +
+ + +
+
\ No newline at end of file
diff --git a/docs/0.7.1/html/dataprofiler.profilers.int_column_profile.html b/docs/0.7.1/html/dataprofiler.profilers.int_column_profile.html
new file mode 100644 index 000000000..b7de33585 --- /dev/null +++ b/docs/0.7.1/html/dataprofiler.profilers.int_column_profile.html @@ -0,0 +1,432 @@
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Int Column Profile

+
+
+class dataprofiler.profilers.int_column_profile.IntColumn(name, options=None)
+

Bases: dataprofiler.profilers.numerical_column_stats.NumericStatsMixin, dataprofiler.profilers.base_column_profilers.BaseColumnPrimitiveTypeProfiler

+

Integer column profile mixin of numerical stats. Represents a column in the dataset which is an integer column.

+

Initialization of column base properties and itself.

+
+
Parameters
+
    +
  • name (String) – Name of the data

  • +
  • options (IntOptions) – Options for the integer column

  • +
+
+
+
+
+type = 'int'
+
+
+
+property profile
+

Property for profile. Returns the profile of the column.

+
+
Returns
+

+
+
+
+
+
+property data_type_ratio
+

Calculates the ratio of samples which match this data type.

+
+
Returns
+

ratio of data type

+
+
Return type
+

float

+
+
+
+
+
+update(df_series)
+

Updates the column profile.

+
+
Parameters
+

df_series (pandas.core.series.Series) – df series

+
+
Returns
+

None

+
+
+
+
+
+col_type = None
+
+
+
+diff(other_profile, options=None)
+

Finds the differences for several numerical stats.

+
+
Parameters
+

other_profile (NumericStatsMixin Profile) – profile to find the difference with

+
+
Returns
+

the numerical stats differences

+
+
Return type
+

dict

+
+
+
+
+
+static is_float(x)
+

For “0.80” this function returns True +For “1.00” this function returns True +For “1” this function returns True

+
+
Parameters
+

x (str) – string to test

+
+
Returns
+

if is float or not

+
+
Return type
+

bool

+
+
+
+
+
+static is_int(x)
+

For “0.80” This function returns False +For “1.00” This function returns True +For “1” this function returns True

+
+
Parameters
+

x (str) – string to test

+
+
Returns
+

if is integer or not

+
+
Return type
+

bool

+
+
+
+
+
+property kurtosis
+
+
+
+property mean
+
+
+
+property mode
+

Finds an estimate for the mode(s) of the data.

+
+
Returns
+

the mode(s) of the data

+
+
Return type
+

list(float)

+
+
+
+
+
+static np_type_to_type(val)
+

Converts numpy variables to base python type variables

+
+
Parameters
+

val (numpy type or base type) – value to check & change

+
+
Return val
+

base python type

+
+
Rtype val
+

int or float

+
+
+
+
+
+property skewness
+
+
+
+property stddev
+
+
+
+property variance
+
+
+
+ +
+ +
+ +
+
\ No newline at end of file
diff --git a/docs/0.7.1/html/dataprofiler.profilers.numerical_column_stats.html b/docs/0.7.1/html/dataprofiler.profilers.numerical_column_stats.html
new file mode 100644 index 000000000..ebcb996ff --- /dev/null +++ b/docs/0.7.1/html/dataprofiler.profilers.numerical_column_stats.html @@ -0,0 +1,417 @@
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Numerical Column Stats

+

coding=utf-8 +Build model for a dataset by identifying type of column along with its +respective parameters.

+
+
+class dataprofiler.profilers.numerical_column_stats.abstractstaticmethod(function)
+

Bases: staticmethod

+
+
+
+class dataprofiler.profilers.numerical_column_stats.NumericStatsMixin(options=None)
+

Bases: object

+

Abstract numerical column profile subclass of BaseColumnProfiler. Represents a numerical column in the dataset. Has subclasses itself.

+

Initialization of column base properties and itself.

+
+
Parameters
+

options (NumericalOptions) – Options for the numerical stats.

+
+
+
+
+type = None
+
+
+
+profile()
+

Property for profile. Returns the profile of the column.

+
+
+
+diff(other_profile, options=None)
+

Finds the differences for several numerical stats.

+
+
Parameters
+

other_profile (NumericStatsMixin Profile) – profile to find the difference with

+
+
Returns
+

the numerical stats differences

+
+
Return type
+

dict

+
+
+
+
+
+property mean
+
+
+
+property mode
+

Finds an estimate for the mode(s) of the data.

+
+
Returns
+

the mode(s) of the data

+
+
Return type
+

list(float)

+
+
+
+
+
+property variance
+
+
+
+property stddev
+
+
+
+property skewness
+
+
+
+property kurtosis
+
+
+
+abstract update(df_series)
+

Abstract Method for updating the numerical profile properties with an +uncleaned dataset.

+
+
Parameters
+

df_series (pandas.core.series.Series) – df series with nulls removed

+
+
Returns
+

None

+
+
+
+
+
+static is_float(x)
+

For “0.80” this function returns True +For “1.00” this function returns True +For “1” this function returns True

+
+
Parameters
+

x (str) – string to test

+
+
Returns
+

if is float or not

+
+
Return type
+

bool

+
+
+
+
+
+static is_int(x)
+

For “0.80” This function returns False +For “1.00” This function returns True +For “1” this function returns True

+
+
Parameters
+

x (str) – string to test

+
+
Returns
+

if is integer or not

+
+
Return type
+

bool

+
+
+
+
+
+static np_type_to_type(val)
+

Converts numpy variables to base python type variables

+
+
Parameters
+

val (numpy type or base type) – value to check & change

+
+
Return val
+

base python type

+
+
Rtype val
+

int or float

+
+
+
+
+
+ +
+ +
+ +
+
\ No newline at end of file
diff --git a/docs/0.7.1/html/dataprofiler.profilers.order_column_profile.html b/docs/0.7.1/html/dataprofiler.profilers.order_column_profile.html
new file mode 100644 index 000000000..773dff8d6 --- /dev/null +++ b/docs/0.7.1/html/dataprofiler.profilers.order_column_profile.html @@ -0,0 +1,330 @@
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Order Column Profile

+
+
+class dataprofiler.profilers.order_column_profile.OrderColumn(name, options=None)
+

Bases: dataprofiler.profilers.base_column_profilers.BaseColumnProfiler

+

Index column profile subclass of BaseColumnProfiler. Represents a column in +the dataset which is an index column.

+

Initialization of column base properties and itself.

+
+
Parameters
+
    +
  • name (String) – Name of the data

  • +
  • options (OrderOptions) – Options for the Order column

  • +
+
+
+
+
+type = 'order'
+
+
+
+property profile
+

Property for profile. Returns the profile of the column.

+
+
Returns
+

+
+
+
+
+
+diff(other_profile, options=None)
+

Generates the differences between the orders of two OrderColumns

+
+
Returns
+

Dict containing the differences between orders in their appropriate output formats

Return type

dict

+
+
+
+update(df_series)
+

Updates the column profile.

+
+
Parameters
+

df_series (pandas.core.series.Series) – df series

+
+
Returns
+

None

+
+
+
+
+
+col_type = None
+
+
+
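As a rough usage sketch of the OrderColumn API documented above (the column name and sample values are made up for illustration):

    import pandas as pd
    from dataprofiler.profilers.order_column_profile import OrderColumn

    col = OrderColumn(name="id")            # name/options per the constructor above
    col.update(pd.Series(["1", "2", "3"]))  # update with a df series
    print(col.profile)                      # profile property of the column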
\ No newline at end of file
diff --git a/docs/0.7.1/html/dataprofiler.profilers.profile_builder.html b/docs/0.7.1/html/dataprofiler.profilers.profile_builder.html
new file mode 100644
index 000000000..37be334bc
--- /dev/null
+++ b/docs/0.7.1/html/dataprofiler.profilers.profile_builder.html
@@ -0,0 +1,789 @@
Profile Builder - Data Profiler v0.7.1
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Profile Builder

+

coding=utf-8

+

Build model for a dataset by identifying type of column along with its +respective parameters.

+
+
+class dataprofiler.profilers.profile_builder.StructuredColProfiler(df_series=None, sample_size=None, min_sample_size=5000, sampling_ratio=0.2, min_true_samples=None, sample_ids=None, pool=None, options=None)
+

Bases: object

+

Instantiate the StructuredColProfiler class for a given column.

+
+
Parameters
+
    +
  • df_series (pandas.core.series.Series) – Data to be profiled

  • +
  • sample_size (int) – Number of samples to use in generating profile

  • +
  • min_true_samples (int) – Minimum number of samples required for the +profiler

  • +
  • sample_ids (list(list)) – Randomized list of sample indices

  • +
  • pool (multiprocessing.Pool) – pool utilized for multiprocessing

  • +
  • options (StructuredOptions Object) – Options for the structured profiler.

  • +
+
+
+
+
+update_column_profilers(clean_sampled_df, pool)
+

Calculates type statistics and labels dataset

+
+
Parameters
+
    +
  • clean_sampled_df (Pandas.Series) – sampled series with none types dropped

  • +
  • pool (multiprocessing.pool) – pool utilized for multiprocessing

  • +
+
+
+
+
+
+diff(other_profile, options=None)
+

Finds the difference between 2 StructuredCols and returns the report

+
+
Parameters
+
    +
  • other (StructuredColProfiler) – Structured col finding the difference with this one.

  • +
  • options (dict) – options to change results of the difference

  • +
+
+
Returns
+

difference of the structured column

+
+
Return type
+

dict

+
+
+
+
+
+property profile
+
+
+
+update_profile(df_series, sample_size=None, min_true_samples=None, sample_ids=None, pool=None)
+

Update the column profiler

+
+
Parameters
+
    +
  • df_series (pandas.core.series.Series) – Data to be profiled

  • +
  • sample_size (int) – Number of samples to use in generating profile

  • +
  • min_true_samples (int) – Minimum number of samples required for the +profiler

  • +
  • sample_ids (list(list)) – Randomized list of sample indices

  • +
  • pool (multiprocessing.Pool) – pool utilized for multiprocessing

  • +
+
+
+
+
+
+static clean_data_and_get_base_stats(df_series, sample_size, null_values=None, min_true_samples=None, sample_ids=None)
+

Identify null characters and return them in a dictionary as well as +remove any nulls in column.

+
+
Parameters
+
    +
  • df_series (pandas.core.series.Series) – a given column

  • +
  • sample_size (int) – Number of samples to use in generating the profile

  • +
  • null_values (dict[str, re.FLAG]) – Dictionary mapping null values to regex flag where +the key represents the null value to remove from the data and the +flag represents the regex flag to apply

  • +
  • min_true_samples (int) – Minimum number of samples required for the +profiler

  • +
  • sample_ids (list(list)) – Randomized list of sample indices

  • +
+
+
Returns
+

updated column with null removed and dictionary of null +parameters

+
+
Return type
+

pd.Series, dict

+
+
+
+
+
+
+class dataprofiler.profilers.profile_builder.BaseProfiler(data, samples_per_update=None, min_true_samples=0, options=None)
+

Bases: object

+

Instantiate the BaseProfiler class

+
+
Parameters
+
    +
  • data (Data class object) – Data to be profiled

  • +
  • samples_per_update (int) – Number of samples to use in generating +profile

  • +
  • min_true_samples (int) – Minimum number of samples required for the +profiler

  • +
  • options (ProfilerOptions Object) – Options for the profiler.

  • +
+
+
Returns
+

Profiler

+
+
+
+
+diff(other_profile, options=None)
+

Finds the difference of two profiles.

Parameters

other (BaseProfiler) – profile being added to this one.

Returns

diff of the two profiles

Return type

dict

+
+
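A hedged sketch of the diff call documented here; profiler_a and profiler_b are assumed to be two already-built profilers of the same kind (for example, two StructuredProfiler instances):

    diff_report = profiler_a.diff(profiler_b)  # dict describing the differences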
+
+property profile
+

Returns the stored profiles for the given profiler.

+
+
Returns
+

None

+
+
+
+
+
+report(report_options=None)
+

Returns the profile report based on all profiled data fed into the +profiler. User can specify the output_formats: (pretty, compact, +serializable, flat).

+
+
+
Pretty: floats are rounded to four decimal places, and lists are shortened.

Compact: Similar to pretty, but removes detailed statistics such as runtimes, label probabilities, index locations of null types, etc.

Serializable: Output is json serializable and not prettified.

Flat: Nested output is returned as a flattened dictionary.

+
+
+
Variables
+

report_options – optional format changes to the report +dict(output_format=<FORMAT>)

+
+
Returns
+

dictionary report

+
+
Return type
+

dict

+
+
+
+
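For example, a specific output format can be requested through the report_options documented above (profiler is assumed to be an already-built profiler instance):

    report = profiler.report(report_options=dict(output_format="compact"))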
+
+update_profile(data, sample_size=None, min_true_samples=None)
+

Update the profile for data provided. User can specify the sample +size to profile the data with. Additionally, the user can specify the +minimum number of non-null samples to profile.

+
+
Parameters
+
    +
  • data (Union[data_readers.base_data.BaseData, pandas.DataFrame, +pandas.Series]) – data to be profiled

  • +
  • sample_size (int) – number of samples to profile from the data

  • +
  • min_true_samples – minimum number of non-null samples to profile

  • +
+
+
+

Returns

None

+
+
+
+save(filepath=None)
+

Save profiler to disk

+
+
Parameters
+

filepath (String) – Path of file to save to

+
+
Returns
+

None

+
+
+
+
+
+classmethod load(filepath)
+

Load profiler from disk

+
+
Parameters
+

filepath (String) – Path of file to load from

+
+
Returns
+

Profiler being loaded, StructuredProfiler or +UnstructuredProfiler

+
+
Return type
+

BaseProfiler

+
+
+
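A sketch of the save/load round trip documented above; the file name is hypothetical and profiler is assumed to be an already-built profiler instance:

    from dataprofiler.profilers.profile_builder import BaseProfiler

    profiler.save(filepath="profile.pkl")
    restored = BaseProfiler.load("profile.pkl")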
+
+
+
+class dataprofiler.profilers.profile_builder.UnstructuredProfiler(data, samples_per_update=None, min_true_samples=0, options=None)
+

Bases: dataprofiler.profilers.profile_builder.BaseProfiler

+

Instantiate the UnstructuredProfiler class

+
+
Parameters
+
    +
  • data (Data class object) – Data to be profiled

  • +
  • samples_per_update (int) – Number of samples to use in generating +profile

  • +
  • min_true_samples (int) – Minimum number of samples required for the +profiler

  • +
  • options (ProfilerOptions Object) – Options for the profiler.

  • +
+
+
Returns
+

UnstructuredProfiler

+
+
+
+
+diff(other_profile, options=None)
+

Finds the difference between 2 unstructured profiles and returns the report.

+
+
Parameters
+
    +
  • other (UnstructuredProfiler) – profile finding the difference with this one.

  • +
  • options (dict) – options to impact the results of the diff

  • +
+
+
Returns
+

difference of the profiles

+
+
Return type
+

dict

+
+
+
+
+
+report(report_options=None)
+

Returns the unstructured report based on all profiled data fed into the +profiler. User can specify the output_formats: (pretty, compact, +serializable, flat).

+
+
+
Pretty: floats are rounded to four decimal places, and lists are shortened.

Compact: Similar to pretty, but removes detailed statistics such as runtimes, label probabilities, index locations of null types, etc.

Serializable: Output is json serializable and not prettified.

Flat: Nested output is returned as a flattened dictionary.

+
+
+
Variables
+

report_options – optional format changes to the report +dict(output_format=<FORMAT>)

+
+
Returns
+

dictionary report

+
+
Return type
+

dict

+
+
+
+
+
+save(filepath=None)
+

Save profiler to disk

+
+
Parameters
+

filepath (String) – Path of file to save to

+
+
Returns
+

None

+
+
+
+
+
+classmethod load(filepath)
+

Load profiler from disk

+
+
Parameters
+

filepath (String) – Path of file to load from

+
+
Returns
+

Profiler being loaded, StructuredProfiler or +UnstructuredProfiler

+
+
Return type
+

BaseProfiler

+
+
+
+
+
+property profile
+

Returns the stored profiles for the given profiler.

+
+
Returns
+

None

+
+
+
+
+
+update_profile(data, sample_size=None, min_true_samples=None)
+

Update the profile for data provided. User can specify the sample +size to profile the data with. Additionally, the user can specify the +minimum number of non-null samples to profile.

+
+
Parameters
+
    +
  • data (Union[data_readers.base_data.BaseData, pandas.DataFrame, +pandas.Series]) – data to be profiled

  • +
  • sample_size (int) – number of samples to profile from the data

  • +
  • min_true_samples – minimum number of non-null samples to profile

  • +
+
+
+

Returns

None

+
+
+
+
+class dataprofiler.profilers.profile_builder.StructuredProfiler(data, samples_per_update=None, min_true_samples=0, options=None)
+

Bases: dataprofiler.profilers.profile_builder.BaseProfiler

+

Instantiate the StructuredProfiler class

+
+
Parameters
+
    +
  • data (Data class object) – Data to be profiled

  • +
  • samples_per_update (int) – Number of samples to use in generating +profile

  • +
  • min_true_samples (int) – Minimum number of samples required for the +profiler

  • +
  • options (ProfilerOptions Object) – Options for the profiler.

  • +
+
+
Returns
+

StructuredProfiler

+
+
+
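As a minimal, hedged sketch of building a structured profile from an in-memory DataFrame and pulling a report (the sample data is made up; the data-labeler step additionally requires the library's optional ML dependencies):

    import pandas as pd
    from dataprofiler.profilers.profile_builder import StructuredProfiler

    data = pd.DataFrame({"id": [1, 2, 3], "name": ["a", "b", "c"]})
    profile = StructuredProfiler(data)
    report = profile.report(report_options=dict(output_format="compact"))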
+
+diff(other_profile, options=None)
+

Finds the difference between 2 Profiles and returns the report

+
+
Parameters
+
    +
  • other (StructuredProfiler) – profile finding the difference with this one

  • +
  • options (dict) – options to change results of the difference

  • +
+
+
Returns
+

difference of the profiles

+
+
Return type
+

dict

+
+
+
+
+
+report(report_options=None)
+

Returns the profile report based on all profiled data fed into the +profiler. User can specify the output_formats: (pretty, compact, +serializable, flat).

+
+
+
Pretty: floats are rounded to four decimal places, and lists are shortened.

Compact: Similar to pretty, but removes detailed statistics such as runtimes, label probabilities, index locations of null types, etc.

Serializable: Output is json serializable and not prettified.

Flat: Nested output is returned as a flattened dictionary.

+
+
+
Variables
+

report_options – optional format changes to the report +dict(output_format=<FORMAT>)

+
+
Returns
+

dictionary report

+
+
Return type
+

dict

+
+
+
+
+
+save(filepath=None)
+

Save profiler to disk

+
+
Parameters
+

filepath (String) – Path of file to save to

+
+
Returns
+

None

+
+
+
+
+
+classmethod load(filepath)
+

Load profiler from disk

+
+
Parameters
+

filepath (String) – Path of file to load from

+
+
Returns
+

Profiler being loaded, StructuredProfiler or +UnstructuredProfiler

+
+
Return type
+

BaseProfiler

+
+
+
+
+
+property profile
+

Returns the stored profiles for the given profiler.

+
+
Returns
+

None

+
+
+
+
+
+update_profile(data, sample_size=None, min_true_samples=None)
+

Update the profile for data provided. User can specify the sample +size to profile the data with. Additionally, the user can specify the +minimum number of non-null samples to profile.

+
+
Parameters
+
    +
  • data (Union[data_readers.base_data.BaseData, pandas.DataFrame, +pandas.Series]) – data to be profiled

  • +
  • sample_size (int) – number of samples to profile from the data

  • +
  • min_true_samples – minimum number of non-null samples to profile

  • +
+
+
+

Returns

None

+
+
+
+
+class dataprofiler.profilers.profile_builder.Profiler(data, samples_per_update=None, min_true_samples=0, options=None, profiler_type=None)
+

Bases: object

+

Factory class for instantiating Structured and Unstructured Profilers

+
+
Parameters
+
    +
  • data (Data class object) – Data to be profiled, type allowed depends on the +profiler_type

  • +
  • samples_per_update (int) – Number of samples to use to generate profile

  • +
  • min_true_samples (int) – Min number of samples required for the profiler

  • +
  • options (ProfilerOptions Object) – Options for the profiler.

  • +
  • profiler_type (str) – Type of Profiler (“structured”/”unstructured”)

  • +
+
+
Returns
+

BaseProfiler

+
+
+
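For instance, the factory can be asked for a structured profiler explicitly; data is assumed to be a supported input such as a Data object or DataFrame:

    from dataprofiler.profilers.profile_builder import Profiler

    profiler = Profiler(data, profiler_type="structured")  # returns a StructuredProfiler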
+
+classmethod load(filepath)
+

Load profiler from disk

+
+
Parameters
+

filepath (String) – Path of file to load from

+
+
Returns
+

Profiler being loaded, StructuredProfiler or +UnstructuredProfiler

+
+
Return type
+

BaseProfiler

+
+
+
+
+
\ No newline at end of file
diff --git a/docs/0.7.1/html/dataprofiler.profilers.profiler_options.html b/docs/0.7.1/html/dataprofiler.profilers.profiler_options.html
new file mode 100644
index 000000000..f94ab7edb
--- /dev/null
+++ b/docs/0.7.1/html/dataprofiler.profilers.profiler_options.html
@@ -0,0 +1,1711 @@
Profiler Options - Data Profiler v0.7.1
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Profiler Options

+

coding=utf-8
Specify the options when running the data profiler.

+
+
+class dataprofiler.profilers.profiler_options.BaseOption
+

Bases: object

+
+
+property properties
+

Returns a copy of the option properties.

+
+
Returns
+

dictionary of the option’s properties attr: value

+
+
Return type
+

dict

+
+
+
+
+
+set(options)
+

Set all the options. Send in a dict that contains all of or a subset of +the appropriate options. Set the values of the options. Will raise error +if the formatting is improper.

+
+
Parameters
+

options (dict) – dict containing the options you want to set.

+
+
Returns
+

None

+
+
+
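A hedged example of set() on the ProfilerOptions subclass defined later on this page; the dotted key below is illustrative of the option-path convention and prefixes the structured option set explicitly:

    from dataprofiler.profilers.profiler_options import ProfilerOptions

    options = ProfilerOptions()
    options.set({"structured_options.data_labeler.is_enabled": False})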
+
+
+validate(raise_error=True)
+

Validates the options do not conflict and cause errors. Raises +error/warning if so.

+
+
Parameters
+

raise_error (bool) – Flag that raises errors if true. Returns errors if +false.

+
+
Returns
+

list of errors (if raise_error is false)

+
+
Return type
+

list(str)

+
+
+
+
+
+
+class dataprofiler.profilers.profiler_options.BooleanOption(is_enabled=True)
+

Bases: dataprofiler.profilers.profiler_options.BaseOption

+

Boolean option

+
+
Variables
+

is_enabled (bool) – boolean option to enable/disable the option.

+
+
+
+
+property properties
+

Returns a copy of the option properties.

+
+
Returns
+

dictionary of the option’s properties attr: value

+
+
Return type
+

dict

+
+
+
+
+
+set(options)
+

Set all the options. Send in a dict that contains all of or a subset of +the appropriate options. Set the values of the options. Will raise error +if the formatting is improper.

+
+
Parameters
+

options (dict) – dict containing the options you want to set.

+
+
Returns
+

None

+
+
+
+
+
+validate(raise_error=True)
+

Validates the options do not conflict and cause errors. Raises +error/warning if so.

+
+
Parameters
+

raise_error (bool) – Flag that raises errors if true. Returns errors if +false.

+
+
Returns
+

list of errors (if raise_error is false)

+
+
Return type
+

list(str)

+
+
+
+
+
+
+class dataprofiler.profilers.profiler_options.HistogramOption(is_enabled=True, bin_count_or_method='auto')
+

Bases: dataprofiler.profilers.profiler_options.BooleanOption

+

Options for histograms

+
+
Variables
+
    +
  • is_enabled (bool) – boolean option to enable/disable the option.

  • +
  • bin_count_or_method (Union[str, int, list(str)]) – bin count or the method with which to +calculate histograms

  • +
+
+
+
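As an illustrative sketch of the HistogramOption constructor documented above (the method name and bin count below are assumptions, not values mandated by this page):

    from dataprofiler.profilers.profiler_options import HistogramOption

    hist_by_method = HistogramOption(bin_count_or_method="sturges")  # assumed method name
    hist_by_count = HistogramOption(bin_count_or_method=100)         # explicit bin count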
+
+property properties
+

Returns a copy of the option properties.

+
+
Returns
+

dictionary of the option’s properties attr: value

+
+
Return type
+

dict

+
+
+
+
+
+set(options)
+

Set all the options. Send in a dict that contains all of or a subset of +the appropriate options. Set the values of the options. Will raise error +if the formatting is improper.

+
+
Parameters
+

options (dict) – dict containing the options you want to set.

+
+
Returns
+

None

+
+
+
+
+
+validate(raise_error=True)
+

Validates the options do not conflict and cause errors. Raises +error/warning if so.

+
+
Parameters
+

raise_error (bool) – Flag that raises errors if true. Returns errors if +false.

+
+
Returns
+

list of errors (if raise_error is false)

+
+
Return type
+

list(str)

+
+
+
+
+
+
+class dataprofiler.profilers.profiler_options.ModeOption(is_enabled=True, max_k_modes=5)
+

Bases: dataprofiler.profilers.profiler_options.BooleanOption

+

Options for mode estimation

+
+
Variables
+
    +
  • is_enabled (bool) – boolean option to enable/disable the option.

  • +
  • top_k_modes (int) – the max number of modes to return, if applicable

  • +
+
+
+
+
+property properties
+

Returns a copy of the option properties.

+
+
Returns
+

dictionary of the option’s properties attr: value

+
+
Return type
+

dict

+
+
+
+
+
+set(options)
+

Set all the options. Send in a dict that contains all of or a subset of +the appropriate options. Set the values of the options. Will raise error +if the formatting is improper.

+
+
Parameters
+

options (dict) – dict containing the options you want to set.

+
+
Returns
+

None

+
+
+
+
+
+validate(raise_error=True)
+

Validates the options do not conflict and cause errors. Raises +error/warning if so.

+
+
Parameters
+

raise_error (bool) – Flag that raises errors if true. Returns errors if +false.

+
+
Returns
+

list of errors (if raise_error is false)

+
+
Return type
+

list(str)

+
+
+
+
+
+
+class dataprofiler.profilers.profiler_options.BaseInspectorOptions(is_enabled=True)
+

Bases: dataprofiler.profilers.profiler_options.BooleanOption

+

Base options for all the columns.

+
+
Variables
+

is_enabled (bool) – boolean option to enable/disable the column.

+
+
+
+
+is_prop_enabled(prop)
+

Checks to see if a property is enabled or not and returns boolean.

+
+
Parameters
+

prop (String) – The option to check if it is enabled

+
+
Returns
+

Whether or not the property is enabled

+
+
Return type
+

Boolean

+
+
+
+
+
+property properties
+

Returns a copy of the option properties.

+
+
Returns
+

dictionary of the option’s properties attr: value

+
+
Return type
+

dict

+
+
+
+
+
+set(options)
+

Set all the options. Send in a dict that contains all of or a subset of +the appropriate options. Set the values of the options. Will raise error +if the formatting is improper.

+
+
Parameters
+

options (dict) – dict containing the options you want to set.

+
+
Returns
+

None

+
+
+
+
+
+validate(raise_error=True)
+

Validates the options do not conflict and cause errors. Raises +error/warning if so.

+
+
Parameters
+

raise_error (bool) – Flag that raises errors if true. Returns errors if +false.

+
+
Returns
+

list of errors (if raise_error is false)

+
+
Return type
+

list(str)

+
+
+
+
+
+
+class dataprofiler.profilers.profiler_options.NumericalOptions
+

Bases: dataprofiler.profilers.profiler_options.BaseInspectorOptions

+

Options for the Numerical Stats Mixin

+
+
Variables
+
    +
  • is_enabled (bool) – boolean option to enable/disable the column.

  • +
  • min (BooleanOption) – boolean option to enable/disable min

  • +
  • max (BooleanOption) – boolean option to enable/disable max

  • +
  • mode (ModeOption) – option to enable/disable mode and set return count

  • +
  • sum (BooleanOption) – boolean option to enable/disable sum

  • +
  • variance (BooleanOption) – boolean option to enable/disable variance

  • +
  • skewness (BooleanOption) – boolean option to enable/disable skewness

  • +
  • kurtosis (BooleanOption) – boolean option to enable/disable kurtosis

  • +
  • histogram_and_quantiles (BooleanOption) – boolean option to enable/disable +histogram_and_quantiles

  • +
+
+
+

  • bias_correction (BooleanOption) – boolean option to enable/disable existence of bias

  • num_zeros (BooleanOption) – boolean option to enable/disable num_zeros

  • num_negatives (BooleanOption) – boolean option to enable/disable num_negatives

  • is_numeric_stats_enabled (bool) – boolean to enable/disable all numeric stats

+
+
+
+
+
+property is_numeric_stats_enabled
+

Returns the state of numeric stats being enabled / disabled. If any +numeric stats property is enabled it will return True, otherwise it +will return False.

+
+
Returns
+

true if any numeric stats property is enabled, otherwise false

+
+
Rtype bool
+

+
+
+
+
+property properties
+

Includes at least: +is_enabled: Turns on or off the column.

+
+
+
+is_prop_enabled(prop)
+

Checks to see if a property is enabled or not and returns boolean.

+
+
Parameters
+

prop (String) – The option to check if it is enabled

+
+
Returns
+

Whether or not the property is enabled

+
+
Return type
+

Boolean

+
+
+
+
+
+set(options)
+

Set all the options. Send in a dict that contains all of or a subset of +the appropriate options. Set the values of the options. Will raise error +if the formatting is improper.

+
+
Parameters
+

options (dict) – dict containing the options you want to set.

+
+
Returns
+

None

+
+
+
+
+
+validate(raise_error=True)
+

Validates the options do not conflict and cause errors. Raises +error/warning if so.

+
+
Parameters
+

raise_error (bool) – Flag that raises errors if true. Returns errors if +false.

+
+
Returns
+

list of errors (if raise_error is false)

+
+
Return type
+

list(str)

+
+
+
+
+
+
+class dataprofiler.profilers.profiler_options.IntOptions
+

Bases: dataprofiler.profilers.profiler_options.NumericalOptions

+

Options for the Int Column

+
+
Variables
+
    +
  • is_enabled (bool) – boolean option to enable/disable the column.

  • +
  • min (BooleanOption) – boolean option to enable/disable min

  • +
  • max (BooleanOption) – boolean option to enable/disable max

  • +
  • mode (ModeOption) – option to enable/disable mode and set return count

  • +
  • sum (BooleanOption) – boolean option to enable/disable sum

  • +
  • variance (BooleanOption) – boolean option to enable/disable variance

  • +
  • skewness (BooleanOption) – boolean option to enable/disable skewness

  • +
  • kurtosis (BooleanOption) – boolean option to enable/disable kurtosis

  • +
  • histogram_and_quantiles (BooleanOption) – boolean option to enable/disable +histogram_and_quantiles

  • +
+
+
+

  • bias_correction (BooleanOption) – boolean option to enable/disable existence of bias

  • num_zeros (BooleanOption) – boolean option to enable/disable num_zeros

  • num_negatives (BooleanOption) – boolean option to enable/disable num_negatives

  • is_numeric_stats_enabled (bool) – boolean to enable/disable all numeric stats

+
+
+
+
+
+property is_numeric_stats_enabled
+

Returns the state of numeric stats being enabled / disabled. If any +numeric stats property is enabled it will return True, otherwise it +will return False.

+
+
Returns
+

true if any numeric stats property is enabled, otherwise false

+
+
Rtype bool
+

+
+
+
+
+is_prop_enabled(prop)
+

Checks to see if a property is enabled or not and returns boolean.

+
+
Parameters
+

prop (String) – The option to check if it is enabled

+
+
Returns
+

Whether or not the property is enabled

+
+
Return type
+

Boolean

+
+
+
+
+
+property properties
+

Includes at least: +is_enabled: Turns on or off the column.

+
+
+
+set(options)
+

Set all the options. Send in a dict that contains all of or a subset of +the appropriate options. Set the values of the options. Will raise error +if the formatting is improper.

+
+
Parameters
+

options (dict) – dict containing the options you want to set.

+
+
Returns
+

None

+
+
+
+
+
+validate(raise_error=True)
+

Validates the options do not conflict and cause errors. Raises +error/warning if so.

+
+
Parameters
+

raise_error (bool) – Flag that raises errors if true. Returns errors if +false.

+
+
Returns
+

list of errors (if raise_error is false)

+
+
Return type
+

list(str)

+
+
+
+
+
+
+class dataprofiler.profilers.profiler_options.PrecisionOptions(is_enabled=True, sample_ratio=None)
+

Bases: dataprofiler.profilers.profiler_options.BooleanOption

+

Options for precision

+
+
Variables
+
    +
  • is_enabled (bool) – boolean option to enable/disable the column.

  • +
  • sample_ratio (float) – float option to determine the ratio of valid float samples used in determining precision. This ratio will override any defaults.

  • +
+
+
+
+
+property properties
+

Returns a copy of the option properties.

+
+
Returns
+

dictionary of the option’s properties attr: value

+
+
Return type
+

dict

+
+
+
+
+
+set(options)
+

Set all the options. Send in a dict that contains all of or a subset of +the appropriate options. Set the values of the options. Will raise error +if the formatting is improper.

+
+
Parameters
+

options (dict) – dict containing the options you want to set.

+
+
Returns
+

None

+
+
+
+
+
+validate(raise_error=True)
+

Validates the options do not conflict and cause errors. Raises +error/warning if so.

+
+
Parameters
+

raise_error (bool) – Flag that raises errors if true. Returns errors if +false.

+
+
Returns
+

list of errors (if raise_error is false)

+
+
Return type
+

list(str)

+
+
+
+
+
+
+class dataprofiler.profilers.profiler_options.FloatOptions
+

Bases: dataprofiler.profilers.profiler_options.NumericalOptions

+

Options for the Float Column.

+
+
Variables
+
    +
  • is_enabled (bool) – boolean option to enable/disable the column.

  • +
  • min (BooleanOption) – boolean option to enable/disable min

  • +
  • max (BooleanOption) – boolean option to enable/disable max

  • +
  • mode (ModeOption) – option to enable/disable mode and set return count

  • +
  • sum (BooleanOption) – boolean option to enable/disable sum

  • +
  • variance (BooleanOption) – boolean option to enable/disable variance

  • +
  • skewness (BooleanOption) – boolean option to enable/disable skewness

  • +
  • kurtosis (BooleanOption) – boolean option to enable/disable kurtosis

  • +
  • histogram_and_quantiles (BooleanOption) – boolean option to enable/disable +histogram_and_quantiles

  • +
+
+
+

  • bias_correction (BooleanOption) – boolean option to enable/disable existence of bias

  • num_zeros (BooleanOption) – boolean option to enable/disable num_zeros

  • num_negatives (BooleanOption) – boolean option to enable/disable num_negatives

  • is_numeric_stats_enabled (bool) – boolean to enable/disable all numeric stats

+
+
+
+
+
+property is_numeric_stats_enabled
+

Returns the state of numeric stats being enabled / disabled. If any +numeric stats property is enabled it will return True, otherwise it +will return False.

+
+
Returns
+

true if any numeric stats property is enabled, otherwise false

+
+
Rtype bool
+

+
+
+
+
+is_prop_enabled(prop)
+

Checks to see if a property is enabled or not and returns boolean.

+
+
Parameters
+

prop (String) – The option to check if it is enabled

+
+
Returns
+

Whether or not the property is enabled

+
+
Return type
+

Boolean

+
+
+
+
+
+property properties
+

Includes at least: +is_enabled: Turns on or off the column.

+
+
+
+set(options)
+

Set all the options. Send in a dict that contains all of or a subset of +the appropriate options. Set the values of the options. Will raise error +if the formatting is improper.

+
+
Parameters
+

options (dict) – dict containing the options you want to set.

+
+
Returns
+

None

+
+
+
+
+
+validate(raise_error=True)
+

Validates the options do not conflict and cause errors. Raises +error/warning if so.

+
+
Parameters
+

raise_error (bool) – Flag that raises errors if true. Returns errors if +false.

+
+
Returns
+

list of errors (if raise_error is false)

+
+
Return type
+

list(str)

+
+
+
+
+
+
+class dataprofiler.profilers.profiler_options.TextOptions
+

Bases: dataprofiler.profilers.profiler_options.NumericalOptions

+

Options for the Text Column:

+
+
Variables
+
    +
  • is_enabled (bool) – boolean option to enable/disable the column.

  • +
  • vocab (BooleanOption) – boolean option to enable/disable vocab

  • +
  • min (BooleanOption) – boolean option to enable/disable min

  • +
  • max (BooleanOption) – boolean option to enable/disable max

  • +
  • mode (ModeOption) – option to enable/disable mode and set return count

  • +
  • sum (BooleanOption) – boolean option to enable/disable sum

  • +
  • variance (BooleanOption) – boolean option to enable/disable variance

  • +
  • skewness (BooleanOption) – boolean option to enable/disable skewness

  • +
  • kurtosis (BooleanOption) – boolean option to enable/disable kurtosis

  • +
+
+
+

  • bias_correction (BooleanOption) – boolean option to enable/disable existence of bias

  • histogram_and_quantiles (BooleanOption) – boolean option to enable/disable histogram_and_quantiles

+
+
+
Variables
+
    +
  • num_zeros (BooleanOption) – boolean option to enable/disable num_zeros

  • +
  • num_negatives (BooleanOption) – boolean option to enable/disable num_negatives

  • +
  • is_numeric_stats_enabled (bool) – boolean to enable/disable all numeric +stats

  • +
+
+
+
+
+property is_numeric_stats_enabled
+

Returns the state of numeric stats being enabled / disabled. If any numeric stats property is enabled it will return True, otherwise it will return False. Although it seems redundant, this method is needed in order for the function below, the setter function also called is_numeric_stats_enabled, to properly work.

+
+
Returns
+

true if any numeric stats property is enabled, otherwise false

+
+
Rtype bool
+

+
+
+
+
+is_prop_enabled(prop)
+

Checks to see if a property is enabled or not and returns boolean.

+
+
Parameters
+

prop (String) – The option to check if it is enabled

+
+
Returns
+

Whether or not the property is enabled

+
+
Return type
+

Boolean

+
+
+
+
+
+property properties
+

Includes at least: +is_enabled: Turns on or off the column.

+
+
+
+set(options)
+

Set all the options. Send in a dict that contains all of or a subset of +the appropriate options. Set the values of the options. Will raise error +if the formatting is improper.

+
+
Parameters
+

options (dict) – dict containing the options you want to set.

+
+
Returns
+

None

+
+
+
+
+
+validate(raise_error=True)
+

Validates the options do not conflict and cause errors. Raises +error/warning if so.

+
+
Parameters
+

raise_error (bool) – Flag that raises errors if true. Returns errors if +false.

+
+
Returns
+

list of errors (if raise_error is false)

+
+
Return type
+

list(str)

+
+
+
+
+
+
+class dataprofiler.profilers.profiler_options.DateTimeOptions
+

Bases: dataprofiler.profilers.profiler_options.BaseInspectorOptions

+

Options for the Datetime Column

+
+
Variables
+

is_enabled (bool) – boolean option to enable/disable the column.

+
+
+
+
+is_prop_enabled(prop)
+

Checks to see if a property is enabled or not and returns boolean.

+
+
Parameters
+

prop (String) – The option to check if it is enabled

+
+
Returns
+

Whether or not the property is enabled

+
+
Return type
+

Boolean

+
+
+
+
+
+property properties
+

Returns a copy of the option properties.

+
+
Returns
+

dictionary of the option’s properties attr: value

+
+
Return type
+

dict

+
+
+
+
+
+set(options)
+

Set all the options. Send in a dict that contains all of or a subset of +the appropriate options. Set the values of the options. Will raise error +if the formatting is improper.

+
+
Parameters
+

options (dict) – dict containing the options you want to set.

+
+
Returns
+

None

+
+
+
+
+
+validate(raise_error=True)
+

Validates the options do not conflict and cause errors. Raises +error/warning if so.

+
+
Parameters
+

raise_error (bool) – Flag that raises errors if true. Returns errors if +false.

+
+
Returns
+

list of errors (if raise_error is false)

+
+
Return type
+

list(str)

+
+
+
+
+
+
+class dataprofiler.profilers.profiler_options.OrderOptions
+

Bases: dataprofiler.profilers.profiler_options.BaseInspectorOptions

+

Options for the Order Column

+
+
Variables
+

is_enabled (bool) – boolean option to enable/disable the column.

+
+
+
+
+is_prop_enabled(prop)
+

Checks to see if a property is enabled or not and returns boolean.

+
+
Parameters
+

prop (String) – The option to check if it is enabled

+
+
Returns
+

Whether or not the property is enabled

+
+
Return type
+

Boolean

+
+
+
+
+
+property properties
+

Returns a copy of the option properties.

+
+
Returns
+

dictionary of the option’s properties attr: value

+
+
Return type
+

dict

+
+
+
+
+
+set(options)
+

Set all the options. Send in a dict that contains all of or a subset of +the appropriate options. Set the values of the options. Will raise error +if the formatting is improper.

+
+
Parameters
+

options (dict) – dict containing the options you want to set.

+
+
Returns
+

None

+
+
+
+
+
+validate(raise_error=True)
+

Validates the options do not conflict and cause errors. Raises +error/warning if so.

+
+
Parameters
+

raise_error (bool) – Flag that raises errors if true. Returns errors if +false.

+
+
Returns
+

list of errors (if raise_error is false)

+
+
Return type
+

list(str)

+
+
+
+
+
+
+class dataprofiler.profilers.profiler_options.CategoricalOptions(is_enabled=True, top_k_categories=None)
+

Bases: dataprofiler.profilers.profiler_options.BaseInspectorOptions

+

Options for the Categorical Column

+
+
Variables
+
    +
  • is_enabled (bool) – boolean option to enable/disable the column.

  • +
  • top_k_categories ([None, int]) – number of categories to be displayed when called

  • +
+
+
+
+
+is_prop_enabled(prop)
+

Checks to see if a property is enabled or not and returns boolean.

+
+
Parameters
+

prop (String) – The option to check if it is enabled

+
+
Returns
+

Whether or not the property is enabled

+
+
Return type
+

Boolean

+
+
+
+
+
+property properties
+

Returns a copy of the option properties.

+
+
Returns
+

dictionary of the option’s properties attr: value

+
+
Return type
+

dict

+
+
+
+
+
+set(options)
+

Set all the options. Send in a dict that contains all of or a subset of +the appropriate options. Set the values of the options. Will raise error +if the formatting is improper.

+
+
Parameters
+

options (dict) – dict containing the options you want to set.

+
+
Returns
+

None

+
+
+
+
+
+validate(raise_error=True)
+

Validates the options do not conflict and cause errors. Raises +error/warning if so.

+
+
Parameters
+

raise_error (bool) – Flag that raises errors if true. Returns errors if +false.

+
+
Returns
+

list of errors (if raise_error is false)

+
+
Return type
+

list(str)

+
+
+
+
+
+
+class dataprofiler.profilers.profiler_options.CorrelationOptions(is_enabled=False, columns=None)
+

Bases: dataprofiler.profilers.profiler_options.BaseInspectorOptions

+

Options for the Correlation between Columns

+
+
Variables
+
    +
  • is_enabled (bool) – boolean option to enable/disable.

  • +
  • columns (list()) – Columns considered to calculate correlation

  • +
+
+
+
+
+is_prop_enabled(prop)
+

Checks to see if a property is enabled or not and returns boolean.

+
+
Parameters
+

prop (String) – The option to check if it is enabled

+
+
Returns
+

Whether or not the property is enabled

+
+
Return type
+

Boolean

+
+
+
+
+
+property properties
+

Returns a copy of the option properties.

+
+
Returns
+

dictionary of the option’s properties attr: value

+
+
Return type
+

dict

+
+
+
+
+
+set(options)
+

Set all the options. Send in a dict that contains all of or a subset of +the appropriate options. Set the values of the options. Will raise error +if the formatting is improper.

+
+
Parameters
+

options (dict) – dict containing the options you want to set.

+
+
Returns
+

None

+
+
+
+
+
+validate(raise_error=True)
+

Validates the options do not conflict and cause errors. Raises +error/warning if so.

+
+
Parameters
+

raise_error (bool) – Flag that raises errors if true. Returns errors if +false.

+
+
Returns
+

list of errors (if raise_error is false)

+
+
Return type
+

list(str)

+
+
+
+
+
+
+class dataprofiler.profilers.profiler_options.DataLabelerOptions
+

Bases: dataprofiler.profilers.profiler_options.BaseInspectorOptions

+

Options for the Data Labeler Column.

+
+
Variables
+
    +
  • is_enabled (bool) – boolean option to enable/disable the column.

  • +
  • data_labeler_dirpath (str) – String to load data labeler from

  • +
  • max_sample_size (BaseDataLabeler) – Int to decide sample size

  • +
  • data_labeler_object – DataLabeler object used in profiler

  • +
+
+
+
+
+property properties
+

Returns a copy of the option properties.

+
+
Returns
+

dictionary of the option’s properties attr: value

+
+
Return type
+

dict

+
+
+
+
+
+is_prop_enabled(prop)
+

Checks to see if a property is enabled or not and returns boolean.

+
+
Parameters
+

prop (String) – The option to check if it is enabled

+
+
Returns
+

Whether or not the property is enabled

+
+
Return type
+

Boolean

+
+
+
+
+
+set(options)
+

Set all the options. Send in a dict that contains all of or a subset of +the appropriate options. Set the values of the options. Will raise error +if the formatting is improper.

+
+
Parameters
+

options (dict) – dict containing the options you want to set.

+
+
Returns
+

None

+
+
+
+
+
+validate(raise_error=True)
+

Validates the options do not conflict and cause errors. Raises +error/warning if so.

+
+
Parameters
+

raise_error (bool) – Flag that raises errors if true. Returns errors if +false.

+
+
Returns
+

list of errors (if raise_error is false)

+
+
Return type
+

list(str)

+
+
+
+
+
+
+class dataprofiler.profilers.profiler_options.TextProfilerOptions(is_enabled=True, is_case_sensitive=True, stop_words=None, top_k_chars=None, top_k_words=None)
+

Bases: dataprofiler.profilers.profiler_options.BaseInspectorOptions

+

Constructs the TextProfilerOption object with default values.

+
+
Variables
+
    +
  • is_enabled (bool) – boolean option to enable/disable the option.

  • +
  • is_case_sensitive (bool) – option set for case sensitivity.

  • +
  • stop_words (Union[None, list(str)]) – option set for stop words.

  • +
  • top_k_chars (Union[None, int]) – option set for number of top common characters.

  • +
  • top_k_words (Union[None, int]) – option set for number of top common words.

  • +
  • words (BooleanOption) – option set for word update.

  • +
  • vocab (BooleanOption) – option set for vocab update.

  • +
+
+
+
+
+is_prop_enabled(prop)
+

Checks to see if a property is enabled or not and returns boolean.

+
+
Parameters
+

prop (String) – The option to check if it is enabled

+
+
Returns
+

Whether or not the property is enabled

+
+
Return type
+

Boolean

+
+
+
+
+
+property properties
+

Returns a copy of the option properties.

+
+
Returns
+

dictionary of the option’s properties attr: value

+
+
Return type
+

dict

+
+
+
+
+
+set(options)
+

Set all the options. Send in a dict that contains all of or a subset of +the appropriate options. Set the values of the options. Will raise error +if the formatting is improper.

+
+
Parameters
+

options (dict) – dict containing the options you want to set.

+
+
Returns
+

None

+
+
+
+
+
+validate(raise_error=True)
+

Validates the options do not conflict and cause errors. Raises +error/warning if so.

+
+
Parameters
+

raise_error (bool) – Flag that raises errors if true. Returns errors if +false.

+
+
Returns
+

list of errors (if raise_error is false)

+
+
Return type
+

list(str)

+
+
+
+
+
+
+class dataprofiler.profilers.profiler_options.StructuredOptions(null_values=None)
+

Bases: dataprofiler.profilers.profiler_options.BaseOption

+

Constructs the StructuredOptions object with default values.

+
+
Parameters
+

null_values – null values we input.

+
+
Variables
+
    +
  • int (IntOptions) – option set for int profiling.

  • +
  • float (FloatOptions) – option set for float profiling.

  • +
  • datetime (DateTimeOptions) – option set for datetime profiling.

  • +
  • text (TextOptions) – option set for text profiling.

  • +
  • order (OrderOptions) – option set for order profiling.

  • +
  • category (CategoricalOptions) – option set for category profiling.

  • +
  • data_labeler (DataLabelerOptions) – option set for data_labeler profiling.

  • +
  • correlation (CorrelationOptions) – option set for correlation profiling.

  • +
  • null_values (Union[None, dict]) – option set for defined null values

  • +
+
+
+
+
+property enabled_profiles
+

Returns a list of the enabled profilers for columns.

+
+
+
+property properties
+

Returns a copy of the option properties.

+
+
Returns
+

dictionary of the option’s properties attr: value

+
+
Return type
+

dict

+
+
+
+
+
+set(options)
+

Set all the options. Send in a dict that contains all of or a subset of +the appropriate options. Set the values of the options. Will raise error +if the formatting is improper.

+
+
Parameters
+

options (dict) – dict containing the options you want to set.

+
+
Returns
+

None

+
+
+
+
+
+validate(raise_error=True)
+

Validates the options do not conflict and cause errors. Raises +error/warning if so.

+
+
Parameters
+

raise_error (bool) – Flag that raises errors if true. Returns errors if +false.

+
+
Returns
+

list of errors (if raise_error is false)

+
+
Return type
+

list(str)

+
+
+
+
+
+
+class dataprofiler.profilers.profiler_options.UnstructuredOptions
+

Bases: dataprofiler.profilers.profiler_options.BaseOption

+

Constructs the UnstructuredOptions object with default values.

+
+
Variables
+
+
+
+
+
+property enabled_profiles
+

Returns a list of the enabled profilers.

+
+
+
+property properties
+

Returns a copy of the option properties.

+
+
Returns
+

dictionary of the option’s properties attr: value

+
+
Return type
+

dict

+
+
+
+
+
+set(options)
+

Set all the options. Send in a dict that contains all of or a subset of +the appropriate options. Set the values of the options. Will raise error +if the formatting is improper.

+
+
Parameters
+

options (dict) – dict containing the options you want to set.

+
+
Returns
+

None

+
+
+
+
+
+validate(raise_error=True)
+

Validates the options do not conflict and cause errors. Raises +error/warning if so.

+
+
Parameters
+

raise_error (bool) – Flag that raises errors if true. Returns errors if +false.

+
+
Returns
+

list of errors (if raise_error is false)

+
+
Return type
+

list(str)

+
+
+
+
+
+
+class dataprofiler.profilers.profiler_options.ProfilerOptions
+

Bases: dataprofiler.profilers.profiler_options.BaseOption

+

Initializes the ProfilerOptions object.

+
+
Variables
+
    +
  • structured_options (StructuredOptions) – option set for structured dataset profiling.

  • +
  • unstructured_options (UnstructuredOptions) – option set for unstructured dataset profiling.

  • +
+
+
+
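A hedged sketch tying the option sets together: tweak an attribute on structured_options and hand the options object to a profiler. The attribute path mirrors the Variables listed above, and data is assumed to be available:

    from dataprofiler.profilers.profile_builder import Profiler
    from dataprofiler.profilers.profiler_options import ProfilerOptions

    options = ProfilerOptions()
    options.structured_options.data_labeler.is_enabled = False
    profiler = Profiler(data, profiler_type="structured", options=options)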
+
+property properties
+

Returns a copy of the option properties.

+
+
Returns
+

dictionary of the option’s properties attr: value

+
+
Return type
+

dict

+
+
+
+
+
+validate(raise_error=True)
+

Validates the options do not conflict and cause errors. Raises +error/warning if so.

+
+
Parameters
+

raise_error (bool) – Flag that raises errors if true. Returns errors if +false.

+
+
Returns
+

list of errors (if raise_error is false)

+
+
Return type
+

list(str)

+
+
+
+
+
+set(options)
+

Overwrites BaseOption.set since the type (unstructured/structured) may need to be specified if the same options exist within both self.structured_options and self.unstructured_options.

+
+
Parameters
+

options (dict) – Dictionary of options to set

+
+
Return
+

None

+
+
+
+
+
\ No newline at end of file
diff --git a/docs/0.7.1/html/dataprofiler.profilers.text_column_profile.html b/docs/0.7.1/html/dataprofiler.profilers.text_column_profile.html
new file mode 100644
index 000000000..7e9c3d78f
--- /dev/null
+++ b/docs/0.7.1/html/dataprofiler.profilers.text_column_profile.html
@@ -0,0 +1,434 @@
Text Column Profile - Data Profiler v0.7.1
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Text Column Profile

+
+
+class dataprofiler.profilers.text_column_profile.TextColumn(name, options=None)
+

Bases: dataprofiler.profilers.numerical_column_stats.NumericStatsMixin, dataprofiler.profilers.base_column_profilers.BaseColumnPrimitiveTypeProfiler

+

Text column profile subclass of BaseColumnProfiler. Represents a column in +the dataset which is a text column.

+

Initialization of column base properties and itself.

+
+
Parameters
+
    +
  • name (String) – Name of the data

  • +
  • options (TextOptions) – Options for the Text column

  • +
+
+
+
+
+type = 'text'
+
+
+
+property profile
+

Property for profile. Returns the profile of the column.

+
+
Returns
+

+
+
+
+
+
+diff(other_profile, options=None)
+

Finds the differences for text columns

+
+
Parameters
+

other_profile (TextColumn Profile) – profile to find the difference with

+
+
Returns
+

the text columns differences

+
+
Return type
+

dict

+
+
+
+
+
+property data_type_ratio
+

Calculates the ratio of samples which match this data type. +NOTE: all values can be considered string so always returns 1 in this +case.

+
+
Returns
+

ratio of data type

+
+
Return type
+

float

+
+
+
+
+
+update(df_series)
+

Updates the column profile.

+
+
Parameters
+

df_series (pandas.core.series.Series) – df series

+
+
Returns
+

None

+
+
+
+
+
+col_type = None
+
+
+
+static is_float(x)
+

For “0.80” this function returns True +For “1.00” this function returns True +For “1” this function returns True

+
+
Parameters
+

x (str) – string to test

+
+
Returns
+

if is float or not

+
+
Return type
+

bool

+
+
+
+
+
+static is_int(x)
+

For “0.80” This function returns False +For “1.00” This function returns True +For “1” this function returns True

+
+
Parameters
+

x (str) – string to test

+
+
Returns
+

if is integer or not

+
+
Return type
+

bool

+
+
+
+
+
+property kurtosis
+
+
+
+property mean
+
+
+
+property mode
+

Finds an estimate for the mode(s) of the data.

+
+
Returns
+

the mode(s) of the data

+
+
Return type
+

list(float)

+
+
+
+
+
+static np_type_to_type(val)
+

Converts numpy variables to base python type variables

+
+
Parameters
+

val (numpy type or base type) – value to check & change

+
+
Return val
+

base python type

+
+
Rtype val
+

int or float

+
+
+
+
+
+property skewness
+
+
+
+property stddev
+
+
+
+property variance
+
+
+
\ No newline at end of file
diff --git a/docs/0.7.1/html/dataprofiler.profilers.unstructured_data_labeler_column_profile.html b/docs/0.7.1/html/dataprofiler.profilers.unstructured_data_labeler_column_profile.html
new file mode 100644
index 000000000..2c6516a0a
--- /dev/null
+++ b/docs/0.7.1/html/dataprofiler.profilers.unstructured_data_labeler_column_profile.html
@@ -0,0 +1,253 @@
Unstructured Data Labeler Column Profile - Data Profiler v0.7.1
\ No newline at end of file
diff --git a/docs/0.7.1/html/dataprofiler.profilers.unstructured_labeler_profile.html b/docs/0.7.1/html/dataprofiler.profilers.unstructured_labeler_profile.html
new file mode 100644
index 000000000..e2db2c1dc
--- /dev/null
+++ b/docs/0.7.1/html/dataprofiler.profilers.unstructured_labeler_profile.html
@@ -0,0 +1,320 @@
Unstructured Labeler Profile - Data Profiler v0.7.1
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Unstructured Labeler Profile

+
+
+class dataprofiler.profilers.unstructured_labeler_profile.UnstructuredLabelerProfile(data_labeler_dirpath=None, options=None)
+

Bases: object

+

Initialization of Data Label profiling for unstructured datasets.

+
+
Parameters
+
    +
  • data_labeler_dirpath (String) – Directory path to the data labeler

  • +
  • options (DataLabelerOptions) – Options for the data labeler column

  • +
+
+
+
+
+type = 'data_labeler'
+
+
+
+diff(other_profile, options=None)
+

Finds the differences for two unstructured labeler profiles

+
+
Parameters
+
    +
  • other_profile (UnstructuredLabelerProfile) – profile to find the difference with

  • +
  • options (dict) – options for diff output

  • +
+
+
Returns
+

the difference between entity counts/percentages

+
+
Return type
+

dict

+
+
+
+
+
+property label_encoding
+
+
+
+update(df_series)
+
+
+
+property profile
+
+
+
\ No newline at end of file
diff --git a/docs/0.7.1/html/dataprofiler.profilers.unstructured_text_profile.html b/docs/0.7.1/html/dataprofiler.profilers.unstructured_text_profile.html
new file mode 100644
index 000000000..800740758
--- /dev/null
+++ b/docs/0.7.1/html/dataprofiler.profilers.unstructured_text_profile.html
@@ -0,0 +1,331 @@
Unstructured Text Profile - Data Profiler v0.7.1
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Unstructured Text Profile

+
+
+class dataprofiler.profilers.unstructured_text_profile.TextProfiler(name, options=None)
+

Bases: object

+

Initialization of Text Profiler.

+
+
Parameters
+
    +
  • name (String) – Name of the data

  • +
  • options (UnstructuredTextOptions) – Options for the Text Profiler

  • +
+
+
+
+
+type = 'text'
+
+
+
+diff(other_profile, options=None)
+

Finds the differences for two unstructured text profiles

+
+
Parameters
+
    +
  • other_profile (TextProfiler) – profile to find the difference with

  • +
  • options (dict) – options for diff output

  • +
+
+
Returns
+

the difference between profiles

+
+
Return type
+

dict

+
+
+
+
+
+property profile
+

Property for profile. Returns the profile of the column.

+
+
Returns
+

+
+
+
+
+
+update(data)
+

Updates the column profile.

+
+
Parameters
+

data (pandas.core.series.Series) – df series

+
+
Returns
+

None

+
+
+
+
+
\ No newline at end of file
diff --git a/docs/0.7.1/html/dataprofiler.profilers.utils.html b/docs/0.7.1/html/dataprofiler.profilers.utils.html
new file mode 100644
index 000000000..b5c42e134
--- /dev/null
+++ b/docs/0.7.1/html/dataprofiler.profilers.utils.html
@@ -0,0 +1,681 @@
Utils - Data Profiler v0.7.1
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Utils

+
+
+dataprofiler.profilers.utils.dict_merge(dct, merge_dct)
+

Recursive dict merge. Inspired by :meth:`dict.update()`; instead of updating only top-level keys, dict_merge recurses down into dicts nested to an arbitrary depth, updating keys. The merge_dct is merged into dct.

+
+
Parameters
+
    +
  • dct – dict onto which the merge is executed

  • +
  • merge_dct – dct merged into dct

  • +
+
+
Returns
+

None

+
+
+
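For example, a minimal sketch of the in-place merge behaviour described above:

    from dataprofiler.profilers.utils import dict_merge

    dct = {"a": {"x": 1}, "b": 2}
    dict_merge(dct, {"a": {"y": 3}, "b": 4})
    # dct is mutated in place (the function returns None):
    # {"a": {"x": 1, "y": 3}, "b": 4}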
+
+
+class dataprofiler.profilers.utils.KeyDict
+

Bases: collections.defaultdict

+

Helper class for sample_in_chunks. Allows keys that are missing to become the values for that key.
From: https://www.drmaciver.com/2018/01/lazy-fisher-yates-shuffling-for-precise-rejection-sampling/

+
+
+clear() None.  Remove all items from D.
+
+
+
+copy() a shallow copy of D.
+
+
+
+default_factory
+

Factory for default value called by __missing__().

+
+
+
+fromkeys(value=None, /)
+

Create a new dictionary with keys from iterable and values set to value.

+
+
+
+get(key, default=None, /)
+

Return the value for key if key is in the dictionary, else default.

+
+
+
+items() a set-like object providing a view on D’s items
+
+
+
+keys() a set-like object providing a view on D’s keys
+
+
+
+pop(k[, d]) v, remove specified key and return the corresponding value.
+

If key is not found, d is returned if given, otherwise KeyError is raised

+
+
+
+popitem()
+

Remove and return a (key, value) pair as a 2-tuple.

+

Pairs are returned in LIFO (last-in, first-out) order. +Raises KeyError if the dict is empty.

+
+
+
+setdefault(key, default=None, /)
+

Insert key with a value of default if key is not in the dictionary.

+

Return the value for key if key is in the dictionary, else default.

+
+
+
+update([E, ]**F) None.  Update D from dict/iterable E and F.
+

If E is present and has a .keys() method, then does: for k in E: D[k] = E[k] +If E is present and lacks a .keys() method, then does: for k, v in E: D[k] = v +In either case, this is followed by: for k in F: D[k] = F[k]

+
+
+
+values() an object providing a view on D’s values
+
+
+
+
+dataprofiler.profilers.utils.shuffle_in_chunks(data_length, chunk_size)
+

A generator for creating shuffled indexes in chunks. This avoids the cost of creating all indexes up front, creating only what is needed.
Initial code idea from: https://www.drmaciver.com/2018/01/lazy-fisher-yates-shuffling-for-precise-rejection-sampling/

+
+
Parameters
+
    +
  • data_length – length of data to be shuffled

  • +
  • chunk_size – size of shuffled chunks

  • +
+
+
Returns
+

list of shuffled indices of chunk size

+
+
+
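A small usage sketch of the generator (the printed chunks are illustrative; their contents are random):

    from dataprofiler.profilers.utils import shuffle_in_chunks

    for chunk in shuffle_in_chunks(data_length=10, chunk_size=4):
        print(chunk)  # e.g. [7, 2, 9, 0] then [5, 3, 8, 1] then [4, 6]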
+
+
+dataprofiler.profilers.utils.warn_on_profile(col_profile, e)
+

Returns a warning if a given profile errors (typically tensorflow).

+
+
Parameters
+
    +
  • col_profile (str) – Name of the column profile

  • +
  • e (Exception) – Error message from profiler error

  • +
+
+
+
+
+
+dataprofiler.profilers.utils.partition(data, chunk_size)
+

Creates a generator which returns the data in the specified chunk size.

+
+
Parameters
+
    +
  • data (list, dataframe, etc) – list, dataframe, etc

  • +
  • chunk_size (int) – size of partition to return

  • +
+
+
+
+
+
+dataprofiler.profilers.utils.suggest_pool_size(data_size=None, cols=None)
+

Suggest the pool size based on resources

+
+
Parameters
+
    +
  • data_size (int) – size of the dataset

  • +
  • cols (int) – columns of the dataset

  • +
+
+
Return suggested_pool_size
+

suggested pool size

+
+
Rtype suggested_pool_size
+

int

+
+
+
+
+
+dataprofiler.profilers.utils.generate_pool(max_pool_size=None, data_size=None, cols=None)
+

Generate a multiprocessing pool to allocate functions to.

+
+
Parameters
+
    +
  • max_pool_size (Union[int, None]) – Max number of processes assigned to the pool

  • +
  • data_size (int) – size of the dataset

  • +
  • cols (int) – columns of the dataset

  • +
+
+
Return pool
+

Multiprocessing pool to allocate processes to

+
+
Rtype pool
+

multiprocessing.Pool

+
+
Return cpu_count
+

Number of processes (cpu bound) to utilize

+
+
Rtype cpu_count
+

int

+
+
+
+
+
+dataprofiler.profilers.utils.overlap(x1, x2, y1, y2)
+

Return True iff [x1:x2] overlaps with [y1:y2]

+
+
+
+dataprofiler.profilers.utils.add_nested_dictionaries(first_dict, second_dict)
+

Merges two dictionaries together and adds values together

+
+
Parameters
+
    +
  • first_dict (dict) – dictionary to be merged

  • +
  • second_dict (dict) – dictionary to be merged

  • +
+
+
Returns
+

merged dictionary

+
+
+
+
+
+dataprofiler.profilers.utils.biased_skew(df_series)
+

Calculates the biased estimator for skewness of the given data. The definition is formalized as g_1 here:

+
+
+
+
Parameters
+

df_series (pandas Series) – data to get skewness of, assuming floats

+
+
Returns
+

biased skewness

+
+
Return type
+

float

+
+
+
+
+
+dataprofiler.profilers.utils.biased_kurt(df_series)
+

Calculates the biased estimator for kurtosis of the given data. The definition is formalized as g_2 here:

+
+
+
+
Parameters
+

df_series (pandas Series) – data to get kurtosis of, assuming floats

+
+
Returns
+

biased kurtosis

+
+
Return type
+

float

+
+
+
+
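For reference, the standard biased estimators g_1 and g_2 (a commonly used definition, stated here as an assumption about the formalization referenced above) are:

\[
g_1 = \frac{m_3}{m_2^{3/2}}, \qquad
g_2 = \frac{m_4}{m_2^{2}} - 3, \qquad
m_k = \frac{1}{n}\sum_{i=1}^{n}\left(x_i - \bar{x}\right)^k
\]

where m_k is the k-th sample central moment over the n observations.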
+
+dataprofiler.profilers.utils.find_diff_of_numbers(stat1, stat2)
+

Finds the difference between two stats. If there is no difference, returns “unchanged”. For ints/floats, returns stat1 - stat2.

+
+
Parameters
+
    +
  • stat1 (Union[int, float]) – the first statistical input

  • +
  • stat2 (Union[int, float]) – the second statistical input

  • +
+
+
Returns
+

the difference of the stats

+
+
+
+
+
+dataprofiler.profilers.utils.find_diff_of_strings_and_bools(stat1, stat2)
+

Finds the difference between two stats. If there is no difference, returns “unchanged”. For strings and bools, returns a list containing [stat1, stat2].

+
+
Parameters
+
    +
  • stat1 (Union[str, bool]) – the first statistical input

  • +
  • stat2 (Union[str, bool]) – the second statistical input

  • +
+
+
Returns
+

the difference of the stats

+
+
+
+
+
+dataprofiler.profilers.utils.find_diff_of_lists_and_sets(stat1, stat2)
+

Finds the difference between two stats. If there is no difference, returns “unchanged”. Removes duplicates and returns [unique values of stat1, shared values, unique values of stat2].

+
+
Parameters
+
    +
  • stat1 (Union[list, set]) – the first statistical input

  • +
  • stat2 (Union[list, set]) – the second statistical input

  • +
+
+
Returns
+

the difference of the stats

+
+
+
+
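Illustrative calls for the three diff helpers above; the expected values in the comments follow from the descriptions, while the exact ordering inside the returned lists is an assumption:

from dataprofiler.profilers import utils

print(utils.find_diff_of_numbers(5, 5))                    # "unchanged"
print(utils.find_diff_of_numbers(5.5, 2.5))                # 3.0
print(utils.find_diff_of_strings_and_bools("cat", "cat"))  # "unchanged"
print(utils.find_diff_of_strings_and_bools("cat", "dog"))  # ["cat", "dog"]
print(utils.find_diff_of_lists_and_sets([1, 2, 3], [2, 3, 4]))
# Expected: [[1], [2, 3], [4]] – unique to stat1, shared values, unique to stat2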
+
+dataprofiler.profilers.utils.find_diff_of_dates(stat1, stat2)
+

Finds the difference between two dates. If there is no difference, returns “unchanged”. For dates, returns the difference in time.

+

Because only days can be stored as negative values internally for timedelta objects, the output for these negative values is less readable due to the combination of signs in the default output. This returns a readable output for timedelta that accounts for potential negative differences.

+
+
Parameters
+
    +
  • stat1 (datetime.datetime object) – the first statistical input

  • +
  • stat2 (datetime.datetime object) – the second statistical input

  • +
+
+
Returns
+

Difference in stats

+
+
Return type
+

str

+
+
+
+
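A hedged sketch: only the “unchanged” result for equal dates is taken from the description; the exact formatting of the returned duration string is an assumption shown for illustration:

import datetime
from dataprofiler.profilers import utils

a = datetime.datetime(2021, 6, 28, 12, 0, 0)
b = datetime.datetime(2021, 6, 27, 9, 30, 0)

print(utils.find_diff_of_dates(a, a))   # "unchanged"
print(utils.find_diff_of_dates(a, b))   # a readable positive duration, e.g. "+1 day 2:30:00" (illustrative)
print(utils.find_diff_of_dates(b, a))   # the same duration with a negative sign (illustrative)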
+
+dataprofiler.profilers.utils.find_diff_of_dicts(dict1, dict2)
+

Finds the difference between two dicts. For each key in each dict, returns “unchanged” if there’s no difference, otherwise returns the difference. Assumes that if the two dictionaries share the same key, their values are the same type.

+
+
Parameters
+
    +
  • dict1 (dict) – the first dict

  • +
  • dict2 (dict) – the second dict

  • +
+
+
Returns
+

Difference in the keys of each dict

+
+
Return type
+

dict

+
+
+
+
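A hedged sketch of diffing two stat dictionaries; the per-key results in the comment are inferred from the scalar diff helpers described above, so they are assumptions:

from dataprofiler.profilers import utils

dict1 = {"min": 1, "max": 5, "label": "a"}
dict2 = {"min": 1, "max": 9, "label": "b"}
print(utils.find_diff_of_dicts(dict1, dict2))
# Expected (assumed): {"min": "unchanged", "max": -4, "label": ["a", "b"]}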
+
+dataprofiler.profilers.utils.find_diff_of_matrices(matrix1, matrix2)
+

Finds the difference between two matrices.

+
+
Parameters
+
    +
  • matrix1 (list(list(float))) – the first matrix

  • +
  • matrix2 (list(list(float))) – the second matrix

  • +
+
+
Returns
+

Difference in the matrix

+
+
Return type
+

list(list(float))

+
+
+
+
+
+dataprofiler.profilers.utils.find_diff_of_dicts_with_diff_keys(dict1, dict2)
+

Finds the difference between two dicts. For each key in each dict, returns “unchanged” if there’s no difference, otherwise returns the difference. Assumes that if the two dictionaries share the same key, their values are the same type.

+
+
Parameters
+
    +
  • dict1 (dict) – the first dict

  • +
  • dict2 (dict) – the second dict

  • +
+
+
Returns
+

Difference in the keys of each dict

+
+
Return type
+

list

+
+
+
+
+
+dataprofiler.profilers.utils.get_memory_size(data, unit='M')
+

Get memory size of the input data

+
+
Parameters
+
    +
  • data (Union[list, numpy.array, pandas.DataFrame]) – list or array of data

  • +
  • unit (string) – memory size unit (B, K, M, or G)

  • +
+
+
Returns
+

memory size of the input data

+
+
+
+
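A small usage sketch; the NumPy array and the 'M' unit are just example inputs, and the printed value depends on the unit conversion, so no exact number is claimed:

import numpy as np
from dataprofiler.profilers import utils

data = np.zeros(1_000_000, dtype=np.float64)    # roughly 8 MB of raw values
size_in_mb = utils.get_memory_size(data, unit='M')
print(size_in_mb)                               # memory footprint in megabytes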
+
+dataprofiler.profilers.utils.method_timeit(method=None, name=None)
+

Measures the execution time of the provided method and records the time into the times dictionary.

+
+
Parameters
+
    +
  • method (Callable) – method to time

  • +
  • name (str) – key argument for the times dictionary

  • +
+
+
+
+
+ +
+ +
+ +
+
+ + + + + + + \ No newline at end of file diff --git a/docs/0.7.1/html/dataprofiler.reports.graphs.html b/docs/0.7.1/html/dataprofiler.reports.graphs.html new file mode 100644 index 000000000..f764fcaf7 --- /dev/null +++ b/docs/0.7.1/html/dataprofiler.reports.graphs.html @@ -0,0 +1,291 @@ + + + + + + + + + Graphs - Data Profiler v0.7.1 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Graphs

+
+
+dataprofiler.reports.graphs.plot_histograms(profiler, columns=None)
+

Takes an input of a StructuredProfiler class and a list of specified column names, then plots the histograms of those that are int or float columns.

+
+
Parameters
+
    +
  • profiler (StructuredProfiler) – StructuredProfiler variable

  • +
  • columns (list) – list of column names to be plotted

  • +
+
+
Returns
+

returns fig

+
+
Return type
+

fig

+
+
+
+
+
+dataprofiler.reports.graphs.plot_col_histogram(data_type_profiler, ax=None, title=None)
+

Takes an input of an Int or Float column and plots its histogram.

+
+
Parameters
+
    +
  • data_type_profiler (Union[IntColumn, FloatColumn]) – the Int or Float column we pass in

  • +
  • ax (list) – ax as in seaborn ax

  • +
  • title (str) – name of an individual histogram

  • +
+
+
Returns
+

ax

+
+
+
+
+ +
+ +
+ +
+
+ + + + + + + \ No newline at end of file diff --git a/docs/0.7.1/html/dataprofiler.reports.html b/docs/0.7.1/html/dataprofiler.reports.html new file mode 100644 index 000000000..87c2379a3 --- /dev/null +++ b/docs/0.7.1/html/dataprofiler.reports.html @@ -0,0 +1,284 @@ + + + + + + + + + Reports - Data Profiler v0.7.1 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Reports

+
+

Modules

+
+
+
+ +
+
+
+ +
+ +
+ +
+
+ + + + + + + \ No newline at end of file diff --git a/docs/0.7.1/html/dataprofiler.settings.html b/docs/0.7.1/html/dataprofiler.settings.html new file mode 100644 index 000000000..5ed72917c --- /dev/null +++ b/docs/0.7.1/html/dataprofiler.settings.html @@ -0,0 +1,253 @@ + + + + + + + + + Settings - Data Profiler v0.7.1 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/0.7.1/html/dataprofiler.validators.base_validators.html b/docs/0.7.1/html/dataprofiler.validators.base_validators.html new file mode 100644 index 000000000..3a7dbc35d --- /dev/null +++ b/docs/0.7.1/html/dataprofiler.validators.base_validators.html @@ -0,0 +1,340 @@ + + + + + + + + + Base Validators - Data Profiler v0.7.1 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Base Validators

+

coding=utf-8

+

Build model for a dataset by identifying type of column along with its respective parameters.

+
+
+dataprofiler.validators.base_validators.is_in_range(x, config)
+

Checks to see if x is in the range specified by the config.

+
+
Parameters
+
    +
  • x (int/float) – number

  • +
  • config (dict) – configuration

  • +
+
+
Returns
+

bool

+
+
+
+
+
+dataprofiler.validators.base_validators.is_in_list(x, config)
+

Checks to see if x is in the config list.

+
+
Parameters
+
    +
  • x (string) – item

  • +
  • config (dict) – configuration

  • +
+
+
Returns
+

bool

+
+
+
+
+
+class dataprofiler.validators.base_validators.Validator
+

Bases: object

+
+
+validate(data, config)
+

Validate a data set. No option for validating a partial data set.

+

Set the configuration on run, not on instantiation of the class, so that you have the option to run multiple times with different configurations without having to also reinstantiate the class.

+
+
Parameters
+
    +
  • data (DataFrame Dask/Pandas) – The data to be processed by the validator. Processing occurs in a column-wise fashion.

  • +
  • config (dict) – configuration for how the validator should run across the given data. Validator will only run over columns specified in the configuration.

  • +
+
+
Example
+

This is an example of the config:

+
config = {
+        <column_name>: {
+                range: {
+                    'start': 1,
+                    'end':2
+                },
+                list: [1,2,3]
+            }
+        }
+
+
+
+
+
+
+
+get()
+

Get the results of the validation run.

+
+
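A hedged end-to-end sketch of the Validator. It assumes the config keys are the string names 'range' and 'list' (the example config above shows them unquoted, so this is an assumption) and that results are retrieved with get() after validate():

import pandas as pd
from dataprofiler.validators.base_validators import Validator

df = pd.DataFrame({"age": [25, 40, 130], "state": ["VA", "TX", "ZZ"]})

config = {
    "age":   {"range": {"start": 0, "end": 120}},
    "state": {"list": ["VA", "TX", "MD"]},
}

validator = Validator()
validator.validate(data=df, config=config)
print(validator.get())   # results of the validation run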
+
+ +
+ +
+ +
+
+ + + + + + + \ No newline at end of file diff --git a/docs/0.7.1/html/dataprofiler.validators.html b/docs/0.7.1/html/dataprofiler.validators.html new file mode 100644 index 000000000..6d8f6842f --- /dev/null +++ b/docs/0.7.1/html/dataprofiler.validators.html @@ -0,0 +1,302 @@ + + + + + + + + + Validators - Data Profiler v0.7.1 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+ + +
+ +
+
+ + + + + + + \ No newline at end of file diff --git a/docs/0.7.1/html/dataprofiler.version.html b/docs/0.7.1/html/dataprofiler.version.html new file mode 100644 index 000000000..4f5dc1270 --- /dev/null +++ b/docs/0.7.1/html/dataprofiler.version.html @@ -0,0 +1,254 @@ + + + + + + + + + Version - Data Profiler v0.7.1 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/0.7.1/html/examples.html b/docs/0.7.1/html/examples.html new file mode 100644 index 000000000..8f0810d5a --- /dev/null +++ b/docs/0.7.1/html/examples.html @@ -0,0 +1,359 @@ + + + + + + + + + Examples - Data Profiler v0.7.1 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Examples

+

These examples provide a more in-depth look into the details of the Data Profiler library.

+
+

Basics

+ +
+
+ +
+ +
+ +
+
+ + + + + + + \ No newline at end of file diff --git a/docs/0.7.1/html/genindex.html b/docs/0.7.1/html/genindex.html new file mode 100644 index 000000000..1b978b1aa --- /dev/null +++ b/docs/0.7.1/html/genindex.html @@ -0,0 +1,2746 @@ + + + + + + + Index - Data Profiler v0.7.1 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ + +
+

Index

+
A | B | C | D | E | F | G | H | I | J | K | L | M | N | O | P | Q | R | S | T | U | V | W
+
+
+

A

+ + + +
+
+ +
+

B

+ + + +
+
+ +
+

C

+ + + +
+
+ +
+

D

+ + + +
+
+ +
+

E

+ + + +
+
+ +
+

F

+ + + +
+
+ +
+

G

+ + + +
+
+ +
+

H

+ + + +
+
+ +
+

I

+ + + +
+
+ +
+

J

+ + + +
+
+ +
+

K

+ + +
+
+ +
+

L

+ + + +
+
+ +
+

M

+ + + +
+
+ +
+

N

+ + + +
+
+ +
+

O

+ + + +
+
+ +
+

P

+ + + +
+
+ +
+

Q

+ + +
+
+ +
+

R

+ + + +
+
+ +
+

S

+ + + +
+
+ +
+

T

+ + + +
+
+ +
+

U

+ + + +
+
+ +
+

V

+ + + +
+
+ +
+

W

+ + + +
+
+ + +
+
+ + + + + +
+
+ +
+
+ + + + + + + \ No newline at end of file diff --git a/docs/0.7.1/html/graphs.html b/docs/0.7.1/html/graphs.html new file mode 100644 index 000000000..712b4e097 --- /dev/null +++ b/docs/0.7.1/html/graphs.html @@ -0,0 +1,396 @@ + + + + + + + + + Graphs - Data Profiler v0.7.1 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Graphs

+
+

Graph Your Data

+

We can plot some of our data as seaborn histogram plots. Below we demonstrate how to do so and provide examples.

+

The following plots are currently available to work directly with your profilers:

+
+
    +
  • histogram (numeric columns only)

  • +
+
+

Below shows how to do so with examples.

+
+

What we need to import

+
from dataprofiler.reports import graphs
+
+
+

The main functions used to plot histograms are in graphs. You will also need the `dataprofiler[reports]` requirement to be installed:

+
pip install 'dataprofiler[reports]'
+
+
+
+
+

Plotting from a StructuredProfiler class

+

With a StructuredProfiler class variable, we can specify what columns we want to be plotted, and plot them into histograms.

+
graphs.plot_histograms(profiler, columns)
+
+
+

These are what the variables mean:

+
+
    +
  • profiler - StructuredProfiler class variable that contains the data we want

  • +
  • columns - (Optional) The list of IntColumn or FloatColumn we want to specifically plot.

  • +
+
+
+
+

Plotting an individual IntColumn or FloatColumn

+

This example uses a CSV file, but JSON, Avro, or Parquet should also work.

+
graphs.plot_col_histogram(column, axes, title)
+
+
+

These are what the variables mean:

+
+
    +
  • column - The IntColumn or FloatColumn we want to plot

  • +
  • axes - (Optional) The axes we want to specify.

  • +
  • title - (Optional) The title of the plot we want to define.

  • +
+
+
+
+

Examples

+
    +
  1. This example demonstrates how we can take a StructuredProfiler class and plot histograms of the specified columns.

  2. +
+
import dataprofiler as dp
+from dataprofiler.reports import graphs
+
+
+data = [[1, 'a', 1.0],
+        [2, 'b', 2.2],
+        [3, 'c', 3.5],
+        [None, 'd', 10.0]]
+profiler = dp.StructuredProfiler(data)
+
+# This will plot all IntColumn and FloatColumn as histograms (The first and last column).
+fig = graphs.plot_histograms(profiler)
+fig.show()
+
+# This will only plot the specified column, 0.
+columns = [0]
+fig = graphs.plot_histograms(profiler, columns)
+fig.show()
+
+
+First Histogram Example Image +Second Histogram Example Image +
    +
  1. This example demonstrates how we can plot a low level profiler.

  2. +
+
import pandas as pd
+
+from dataprofiler.profilers import IntColumn
+from dataprofiler.reports import graphs
+
+
+data = pd.Series([1, 2, 3], dtype=str)
+profiler = IntColumn('example')
+profiler.update(data)
+
+# Plot the axes
+ax = graphs.plot_col_histogram(profiler)
+
+# get and show the figure of the plotted histogram
+fig = ax.get_figure()
+fig.show()
+
+
+Histogram Column Only Example Image +
+
+
+ +
+ +
+ +
+
+ + + + + + + \ No newline at end of file diff --git a/docs/0.7.1/html/index.html b/docs/0.7.1/html/index.html new file mode 100644 index 000000000..4477d3997 --- /dev/null +++ b/docs/0.7.1/html/index.html @@ -0,0 +1,659 @@ + + + + + + + + + Home - Data Profiler v0.7.1 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Data Profiler | What’s in your data?

+
+

Purpose

+

The DataProfiler is a Python library designed to make data analysis, monitoring and sensitive data detection easy.

+

Loading Data with a single command, the library automatically formats & loads files into a DataFrame. Profiling the Data, the library identifies the schema, statistics, entities and more. Data Profiles can then be used in downstream applications or reports.

+

The Data Profiler comes with a cutting edge pre-trained deep learning model, used to efficiently identify sensitive data (or PII). If customization is needed, it’s easy to add new entities to the existing pre-trained model or insert a new pipeline for entity recognition.

+

The best part? Getting started only takes a few lines of code (Example CSV):

+
import json
+from dataprofiler import Data, Profiler
+
+data = Data("your_file.csv") # Auto-Detect & Load: CSV, AVRO, Parquet, JSON, Text
+print(data.data.head(5)) # Access data directly via a compatible Pandas DataFrame
+
+profile = Profiler(data) # Calculate Statistics, Entity Recognition, etc
+readable_report = profile.report(report_options={"output_format":"pretty"})
+print(json.dumps(readable_report, indent=4))
+
+
+

To install the full package from pypi:

+
pip install DataProfiler[ml]
+
+
+

If the ML requirements are too strict (say, you don’t want to install tensorflow), you can install a slimmer package. The slimmer package disables the default sensitive data detection / entity recognition (labeler).

+

Install from pypi:

+
pip install DataProfiler
+
+
+

If you have suggestions or find a bug, please open an issue.

+

Visit the API to explore Data Profiler’s terminology.

+
+
+

What is a Data Profile?

+

In the case of this library, a data profile is a dictionary containing statistics and predictions about the underlying dataset. There are “global statistics” or global_stats, which contain dataset level data and there are “column/row level statistics” or data_stats (each column is a new key-value entry).

+

The format for a structured profile is below:

+
"global_stats": {
+    "samples_used": int,
+    "column_count": int,
+    "row_count": int,
+    "row_has_null_ratio": float,
+    "row_is_null_ratio": float,
+    "unique_row_ratio": float,
+    "duplicate_row_count": int,
+    "file_type": string,
+    "encoding": string,
+    "correlation_matrix": list(list(int)), (*)
+    "profile_schema": dict[string, list(int)]
+},
+"data_stats": [
+    {
+        "column_name": string,
+        "data_type": string,
+        "data_label": string,
+        "categorical": bool,
+        "order": string,
+        "samples": list(str),
+        "statistics": {
+            "sample_size": int,
+            "null_count": int,
+            "null_types": list(string),
+            "null_types_index": dict[string, list(int)],
+            "data_type_representation": dict[string, list(string)],
+            "min": [null, float],
+            "max": [null, float],
+            "sum": float,
+            "mean": float,
+            "variance": float,
+            "stddev": float,
+            "skewness": float,
+            "kurtosis": float,
+            "num_zeros": int,
+            "num_negatives": int,
+            "histogram": {
+                "bin_counts": list(int),
+                "bin_edges": list(float),
+            },
+            "quantiles": {
+                int: float
+            },
+            "vocab": list(char),
+            "avg_predictions": dict[string, float],
+            "data_label_representation": dict[string, float],
+            "categories": list(str),
+            "unique_count": int,
+            "unique_ratio": float,
+            "categorical_count": dict[string, int],
+            "gini_impurity": float,
+            "unalikeability": float,
+            "precision": {
+                'min': int,
+                'max': int,
+                'mean': float,
+                'var': float,
+                'std': float,
+                'sample_size': int,
+                'margin_of_error': float,
+                'confidence_level': float
+            },
+            "times": dict[string, float],
+            "format": string
+        }
+    }
+]
+
+
+

(*) Currently the correlation matrix update is toggled off. It will be reset in a later update. Users can still use it as desired with the is_enable option set to True.

+

The format for an unstructured profile is below:

+
"global_stats": {
+    "samples_used": int,
+    "empty_line_count": int,
+    "file_type": string,
+    "encoding": string,
+    "memory_size": float, # in MB
+},
+"data_stats": {
+    "data_label": {
+        "entity_counts": {
+            "word_level": dict[string, int],
+            "true_char_level": dict[string, int],
+            "postprocess_char_level": dict[string, int]
+        },
+        "entity_percentages": {
+            "word_level": dict[string, float],
+            "true_char_level": dict[string, float],
+            "postprocess_char_level": dict[string, float]
+        },
+        "times": dict[string, float]
+    },
+    "statistics": {
+        "vocab": list(char),
+        "vocab_count": dict[string, int],
+        "words": list(string),
+        "word_count": dict[string, int],
+        "times": dict[string, float]
+    }
+}
+
+
+
+

Supported Data Formats

+
    +
  • Any delimited file (CSV, TSV, etc.)

  • +
  • JSON object

  • +
  • Avro file

  • +
  • Parquet file

  • +
  • Text file

  • +
  • Pandas DataFrame

  • +
  • A URL that points to one of the supported file types above

  • +
+
+
+

Data Labels

+

Data Labels are determined per cell for structured data (column/row when the profiler is used) or at the character level for unstructured data.

+
    +
  • UNKNOWN

  • +
  • ADDRESS

  • +
  • BAN (bank account number, 10-18 digits)

  • +
  • CREDIT_CARD

  • +
  • EMAIL_ADDRESS

  • +
  • UUID

  • +
  • HASH_OR_KEY (md5, sha1, sha256, random hash, etc.)

  • +
  • IPV4

  • +
  • IPV6

  • +
  • MAC_ADDRESS

  • +
  • PERSON

  • +
  • PHONE_NUMBER

  • +
  • SSN

  • +
  • URL

  • +
  • US_STATE

  • +
  • DRIVERS_LICENSE

  • +
  • DATE

  • +
  • TIME

  • +
  • DATETIME

  • +
  • INTEGER

  • +
  • FLOAT

  • +
  • QUANTITY

  • +
  • ORDINAL

  • +
+
+
+
+

Get Started

+
+

Load a File

+

The profiler should automatically identify the file type and load the data into a Data Class.

+

Along with other attributes, the Data class enables structured data to be accessed via a valid Pandas DataFrame.

+
# Load a csv file, return a CSVData object
+csv_data = Data('your_file.csv')
+
+# Print the first 10 rows of the csv file
+print(csv_data.data.head(10))
+
+# Load a parquet file, return a ParquetData object
+parquet_data = Data('your_file.parquet')
+
+# Sort the data by the name column
+parquet_data.data.sort_values(by='name', inplace=True)
+
+# Print the sorted first 10 rows of the parquet data
+print(parquet_data.data.head(10))
+
+
+

If the file type is not automatically identified (rare), you can specify it explicitly; see the Data Readers section.

+
+
+

Profile a File

+

This example uses a CSV file, but JSON, Avro, Parquet, or Text should also work.

+
import json
+from dataprofiler import Data, Profiler
+
+# Load file (CSV should be automatically identified)
+data = Data("your_file.csv")
+
+# Profile the dataset
+profile = Profiler(data)
+
+# Generate a report and use json to prettify.
+report  = profile.report(report_options={"output_format":"pretty"})
+
+# Print the report
+print(json.dumps(report, indent=4))
+
+
+
+
+

Updating Profiles

+

Currently, the data profiler is equipped to update its profile in batches.

+
import json
+from dataprofiler import Data, Profiler
+
+# Load and profile a CSV file
+data = Data("your_file.csv")
+profile = Profiler(data)
+
+# Update the profile with new data:
+new_data = Data("new_data.csv")
+profile.update_profile(new_data)
+
+# Print the report using json to prettify.
+report  = profile.report(report_options={"output_format":"pretty"})
+print(json.dumps(report, indent=4))
+
+
+
+
+

Merging Profiles

+

If you have two files with the same schema (but different data), it is possible to merge the two profiles together via an addition operator.

+

This also enables profiles to be determined in a distributed manner.

+
import json
+from dataprofiler import Data, Profiler
+
+# Load a CSV file with a schema
+data1 = Data("file_a.csv")
+profile1 = Profiler(data1)
+
+# Load another CSV file with the same schema
+data2 = Data("file_b.csv")
+profile2 = Profiler(data2)
+
+profile3 = profile1 + profile2
+
+# Print the report using json to prettify.
+report  = profile3.report(report_options={"output_format":"pretty"})
+print(json.dumps(report, indent=4))
+
+
+
+
+

Profile a Pandas DataFrame

+
import pandas as pd
+import dataprofiler as dp
+import json
+
+my_dataframe = pd.DataFrame([[1, 2.0],[1, 2.2],[-1, 3]])
+profile = dp.Profiler(my_dataframe)
+
+# print the report using json to prettify.
+report = profile.report(report_options={"output_format":"pretty"})
+print(json.dumps(report, indent=4))
+
+# read a specified column, in this case it is labeled 0:
+print(json.dumps(report["data stats"][0], indent=4))
+
+
+
+
+

Unstructured Profiler

+

In addition to the structured profiler, the Data Profiler provides unstructured profiling for the TextData object or string. Unstructured profiling also works with list(string), pd.Series(string), or pd.DataFrame(string) given the profiler_type option specified as unstructured. Below is an example of an unstructured profile with a text file.

+
import dataprofiler as dp
+import json
+my_text = dp.Data('text_file.txt')
+profile = dp.Profiler(my_text)
+
+# print the report using json to prettify.
+report = profile.report(report_options={"output_format":"pretty"})
+print(json.dumps(report, indent=4))
+
+
+

Another example of an unstructured profile with a pd.Series of strings is given below.

+
import dataprofiler as dp
+import pandas as pd
+import json
+
+text_data = pd.Series(['first string', 'second string'])
+profile = dp.Profiler(text_data, profiler_type="unstructured")
+
+# print the report using json to prettify.
+report = profile.report(report_options={"output_format":"pretty"})
+print(json.dumps(report, indent=4))
+
+
+
+
+

Specifying a Filetype or Delimiter

+

Example of specifying a CSV data type, with a , delimiter. In addition, it utilizes only the first 10,000 rows.

+
import json
+import os
+from dataprofiler import Data, Profiler
+from dataprofiler.data_readers.csv_data import CSVData
+
+# Load a CSV file, with "," as the delimiter
+data = CSVData("your_file.csv", options={"delimiter": ","})
+
+# Split the data, such that only the first 10,000 rows are used
+data = data.data[0:10000]
+
+# Read in profile and print results
+profile = Profiler(data)
+print(json.dumps(profile.report(report_options={"output_format":"pretty"}), indent=4))
+
+
+
+
+
+
+
+
+
+
+
+

Versions

+ +
+
+ +
+ +
+ +
+
+ + + + + + + \ No newline at end of file diff --git a/docs/0.7.1/html/install.html b/docs/0.7.1/html/install.html new file mode 100644 index 000000000..41d258e65 --- /dev/null +++ b/docs/0.7.1/html/install.html @@ -0,0 +1,376 @@ + + + + + + + + + Install - Data Profiler v0.7.1 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Install

+

To install the full package from pypi:

+
pip install DataProfiler[ml]
+
+
+

If the ML requirements are too strict (say, you don’t want to install tensorflow), you can install a slimmer package. The slimmer package disables the default sensitive data detection / entity recognition (labeler).

+

Install from pypi:

+
pip install DataProfiler
+
+
+
+

Snappy Installation

+

This is required to profile parquet/avro datasets

+

MacOS with homebrew:

+
brew install snappy
+
+
+

Linux install:

+
sudo apt-get -y install libsnappy-dev
+
+
+
+
+

Build From Scratch

+

NOTE: Installation for python3

+

virtualenv install:

+
python3 -m pip install virtualenv
+
+
+

Setup virtual env:

+
python3 -m virtualenv --python=python3 venv3
+source venv3/bin/activate
+
+
+

Install requirements:

+
pip3 install -r requirements.txt
+
+
+

Install labeler dependencies:

+
pip3 install -r requirements-ml.txt
+
+
+

Install via the repo – Build setup.py and install locally:

+
python3 setup.py sdist bdist bdist_wheel
+pip3 install dist/DataProfiler*-py3-none-any.whl
+
+
+

If you see:

+
ERROR: Double requirement given:dataprofiler==X.Y.Z from dataprofiler/dist/DataProfiler-X.Y.Z-py3-none-any.whl (already in dataprofiler==X2.Y2.Z2 from dataprofiler/dist/DataProfiler-X2.Y2.Z2-py3-none-any.whl, name='dataprofiler')
+
+
+

This means that you have multiple versions of the DataProfiler distribution in the dist folder. To resolve, either remove the older one or delete the folder and rerun the steps above.

+

Install via github:

+
pip3 install git+https://github.com/capitalone/dataprofiler.git#egg=dataprofiler
+
+
+
+
+

Testing

+

For testing, install test requirements:

+
pip3 install -r requirements-test.txt
+
+
+

To run all unit tests, use:

+
DATAPROFILER_SEED=0 python3 -m unittest discover -p "test*.py"
+
+
+

To run a file of unit tests, use the form:

+
DATAPROFILER_SEED=0 python3 -m unittest discover -p test_profile_builder.py
+
+
+

To run a file with pytest, use:

+
DATAPROFILER_SEED=0 pytest dataprofiler/tests/data_readers/test_csv_data.py -v
+
+
+

To run an individual unit test, use the form:

+
DATAPROFILER_SEED=0 python3 -m unittest dataprofiler.tests.profilers.test_profile_builder.TestProfiler
+
+
+
+
+ +
+ +
+ +
+
+ + + + + + + \ No newline at end of file diff --git a/docs/0.7.1/html/labeler.html b/docs/0.7.1/html/labeler.html new file mode 100644 index 000000000..a3a3d75eb --- /dev/null +++ b/docs/0.7.1/html/labeler.html @@ -0,0 +1,916 @@ + + + + + + + + + Sensitive Data Detection with the Labeler - Data Profiler v0.7.1 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ + + +

View this notebook on GitHub

+
+

Sensitive Data Detection with the Labeler

+

In this example, we utilize the Labeler component of the Data Profiler to detect sensitive information in both structured and unstructured data. In addition, we show how to train the Labeler on a specific dataset with a different list of entities.

+

First, let’s dive into what the Labeler is.

+
+

What is the Labeler

+

The Labeler is a pipeline designed to make building, training, and predicting with ML models quick and easy. There are 3 major components to the Labeler: the preprocessor, the model, and the postprocessor.

Labeler pipeline flowchart (DL-Flowchart.png)

Each component can be switched out individually to suit your needs. As you might expect, the preprocessor takes in raw data and prepares it for the model, the model performs the prediction or training, and the postprocessor takes prediction results and turns them into human-readable results.

+

Now let’s run some examples. Start by importing all the requirements.

+
+
[ ]:
+
+
+
+import os
+import sys
+import json
+import pandas as pd
+sys.path.insert(0, '..')
+import dataprofiler as dp
+import tensorflow as tf
+tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
+
+
+
+
+
+

Structured Data Prediction

+

We’ll use a dataset in the test folder for this example. First, look at the data using the Data Reader class of the Data Profiler. This dataset is from the US Department of Education, found here!

+
+
[ ]:
+
+
+
+data = dp.Data("../dataprofiler/tests/data/csv/SchoolDataSmall.csv")
+df_data = data.data
+df_data.head()
+
+
+
+

We can directly predict the labels of a structured dataset on the cell level.

+
+
[ ]:
+
+
+
+labeler = dp.DataLabeler(labeler_type='structured')
+
+# print out the labels and label mapping
+print("Labels: {}".format(labeler.labels))
+print("\n")
+print("Label Mapping: {}".format(labeler.label_mapping))
+print("\n")
+
+# make predictions and get labels for each cell going row by row
+# predict options are model dependent and the default model can show prediction confidences
+predictions = labeler.predict(data, predict_options={"show_confidences": True})
+
+# display prediction results
+print("Predictions: {}".format(predictions['pred']))
+print("\n")
+
+# display confidence results
+print("Confidences: {}".format(predictions['conf']))
+
+
+
+

The profiler uses the Labeler to perform column-by-column predictions. The data contains 11 columns, each of which has a data label. Next, we will use the Labeler of the Data Profiler to predict the label for each column in this tabular dataset. Since we are only going to demo the labeling functionality, other options of the Data Profiler are disabled to keep this quick.

+
+
[ ]:
+
+
+
+# set options to only run the labeler
+profile_options = dp.ProfilerOptions()
+profile_options.set({"structured_options.text.is_enabled": False,
+                     "int.is_enabled": False,
+                     "float.is_enabled": False,
+                     "order.is_enabled": False,
+                     "category.is_enabled": False,
+                     "datetime.is_enabled": False,})
+
+profile = dp.Profiler(data, options=profile_options)
+
+# get the prediction from the data profiler
+def get_structured_results(results):
+    columns = []
+    predictions = []
+    for col_report in results['data_stats']:
+        columns.append(col_report['column_name'])
+        predictions.append(col_report['data_label'])
+
+    df_results = pd.DataFrame({'Column': columns, 'Prediction': predictions})
+    return df_results
+
+results = profile.report()
+print(get_structured_results(results))
+
+
+
+

In this example, the results show that the Data Profiler is able to detect integers, URLs, addresses, and floats appropriately. Unknown is typically strings of text, which is appropriate for those columns.

+
+
+

Unstructured Data Prediction

+

Besides structured data, the Labeler detects sensitive information in unstructured text. We use a sample email from the Enron email dataset for this demo. As above, we start by investigating the content of the given email sample.

+
+
[ ]:
+
+
+
+# load data
+data = "Message-ID: <11111111.1111111111111.JavaMail.evans@thyme>\n" + \
+        "Date: Fri, 10 Aug 2005 11:31:37 -0700 (PDT)\n" + \
+        "From: w..smith@company.com\n" + \
+        "To: john.smith@company.com\n" + \
+        "Subject: RE: ABC\n" + \
+        "Mime-Version: 1.0\n" + \
+        "Content-Type: text/plain; charset=us-ascii\n" + \
+        "Content-Transfer-Encoding: 7bit\n" + \
+        "X-From: Smith, Mary W. </O=ENRON/OU=NA/CN=RECIPIENTS/CN=SSMITH>\n" + \
+        "X-To: Smith, John </O=ENRON/OU=NA/CN=RECIPIENTS/CN=JSMITH>\n" + \
+        "X-cc: \n" + \
+        "X-bcc: \n" + \
+        "X-Folder: \SSMITH (Non-Privileged)\Sent Items\n" + \
+        "X-Origin: Smith-S\n" + \
+        "X-FileName: SSMITH (Non-Privileged).pst\n\n" + \
+        "All I ever saw was the e-mail from the office.\n\n" + \
+        "Mary\n\n" + \
+        "-----Original Message-----\n" + \
+        "From:   Smith, John  \n" + \
+        "Sent:   Friday, August 10, 2005 13:07 PM\n" + \
+        "To:     Smith, Mary W.\n" + \
+        "Subject:        ABC\n\n" + \
+        "Have you heard any more regarding the ABC sale? I guess that means that " + \
+        "it's no big deal here, but you think they would have send something.\n\n\n" + \
+        "John Smith\n" + \
+        "123-456-7890\n"
+
+# convert string data to list to feed into the labeler
+data = [data]
+
+
+
+

By default, the Labeler predicts the results at the character level for unstructured text.

+
+
[ ]:
+
+
+
+labeler = dp.DataLabeler(labeler_type='unstructured')
+
+# make predictions and get labels per character
+predictions = labeler.predict(data)
+
+# display results
+print(predictions['pred'])
+
+
+
+

In addition to the character-level result, the Labeler provides results at the word level following the standard NER (Named Entity Recognition) format, e.g., as utilized by spaCy.

+
+
[ ]:
+
+
+
+# convert prediction to word format and ner format
+# Set the output to the NER format (start position, end position, label)
+labeler.set_params(
+    { 'postprocessor': { 'output_format':'ner', 'use_word_level_argmax':True } }
+)
+
+# make predictions and get labels per character
+predictions = labeler.predict(data)
+
+# display results
+print('\n')
+print('=======================Prediction======================\n')
+for pred in predictions['pred'][0]:
+    print('{}: {}'.format(data[0][pred[0]: pred[1]], pred[2]))
+    print('--------------------------------------------------------')
+
+
+
+

Here, the Labeler is able to identify sensitive information such as datetime, email address, person names, and phone number in an email sample.

+
+
+

Train the Labeler from Scratch

+

The Labeler can be trained from scratch with a new list of labels. Below, we show an example of training the Labeler on a dataset with labels given as the columns of that dataset. For brevity’s sake, let’s only train a few epochs with a subset of a dataset.

+
+
[ ]:
+
+
+
+data = dp.Data("../dataprofiler/tests/data/csv/SchoolDataSmall.csv")
+df = data.data[["OPEID6", "INSTURL", "SEARCH_STRING"]]
+df.head()
+
+# split data to training and test set
+split_ratio = 0.2
+df = df.sample(frac=1).reset_index(drop=True)
+data_train = df[:int((1 - split_ratio) * len(df))]
+data_test = df[int((1 - split_ratio) * len(df)):]
+
+# train a new labeler with column names as labels
+if not os.path.exists('data_labeler_saved'):
+    os.makedirs('data_labeler_saved')
+
+labeler = dp.train_structured_labeler(
+    data=data_train,
+    save_dirpath="data_labeler_saved",
+    epochs=10,
+    default_label="OPEID6"
+)
+
+
+
+
+

The trained Labeler is then used by the Data Profiler to provide the prediction on the new dataset.

+
+
[ ]:
+
+
+
+# predict with the labeler object
+profile_options.set({'structured_options.data_labeler.data_labeler_object': labeler})
+profile = dp.Profiler(data_test, options=profile_options)
+
+# get the prediction from the data profiler
+results = profile.report()
+print(get_structured_results(results))
+
+
+
+

Another way to use the trained Labeler is through the directory path of the saved labeler.

+
+
[ ]:
+
+
+
+# predict with the labeler loaded from path
+profile_options.set({'structured_options.data_labeler.data_labeler_dirpath': 'data_labeler_saved'})
+profile = dp.Profiler(data_test, options=profile_options)
+
+# get the prediction from the data profiler
+results = profile.report()
+print(get_structured_results(results))
+
+
+
+
+
+

Transfer Learning a Labeler

+

Instead of training a model from scratch, we can also transfer learn to improve the model and/or extend the labels. Again for brevity’s sake, let’s only train a few epochs with a small dataset at the cost of accuracy.

+
+
[ ]:
+
+
+
+data = dp.Data("../dataprofiler/tests/data/csv/SchoolDataSmall.csv")
+df_data = data.data[["OPEID6", "INSTURL", "SEARCH_STRING"]]
+
+
+# prep data
+df_data = df_data.reset_index(drop=True).melt()
+df_data.columns = [1, 0]  # labels=1, values=0 in that order
+df_data = df_data.astype(str)
+new_labels = df_data[1].unique().tolist()
+
+# load structured Labeler w/ trainable set to True
+labeler = dp.DataLabeler(labeler_type='structured', trainable=True)
+
+# Reconstruct the model to add each new label
+for label in new_labels:
+    labeler.add_label(label)
+
+# this will use transfer learning to retrain the labeler on your new
+# dataset and labels.
+# Setting labels with a list of labels or label mapping will overwrite the existing labels with new ones
+# Setting the reset_weights parameter to false allows transfer learning to occur
+model_results = labeler.fit(x=df_data[0], y=df_data[1], validation_split=0.2,
+                                 epochs=10, labels=None, reset_weights=False)
+
+
+
+

Let’s display the training results of the last epoch:

+
+
[ ]:
+
+
+
+print("{:16s}  Precision  Recall  F1-score  Support".format(""))
+for item in model_results[-1][2]:
+    print("{:16s}  {:4.3f}      {:4.3f}   {:4.3f}     {:7.0f}".format(item,
+                                                                      model_results[-1][2][item]["precision"],
+                                                                      model_results[-1][2][item]["recall"],
+                                                                      model_results[-1][2][item]["f1-score"],
+                                                                      model_results[-1][2][item]["support"]))
+
+
+
+

It is now trained to detect additional labels! The model results here show the training accuracy for all of the labels. Since only new labels existed in the dataset, only the new labels are given accuracy scores. Keep in mind this is a small dataset for brevity’s sake and that real training would involve more samples and better results.

+
+
+

Saving and Loading a Labeler

+

The Labeler can easily be saved or loaded with one simple line.

+
+
[ ]:
+
+
+
+# Ensure save directory exists
+if not os.path.exists('my_labeler'):
+    os.makedirs('my_labeler')
+
+# Saving the labeler
+labeler.save_to_disk("my_labeler")
+
+# Loading the labeler
+labeler = dp.DataLabeler(labeler_type='structured', dirpath="my_labeler")
+
+
+
+
+
+

Building a Labeler from the Ground Up

+

As mentioned earlier, the labeler is composed of three components, and each of the components can be created and interchanged in the labeler pipeline.

+
+
[ ]:
+
+
+
+import random
+from dataprofiler.labelers.character_level_cnn_model import \
+    CharacterLevelCnnModel
+from dataprofiler.labelers.data_processing import \
+    StructCharPreprocessor, StructCharPostprocessor
+
+model = CharacterLevelCnnModel({"PAD":0, "UNKNOWN":1, "Test_Label":2})
+preprocessor = StructCharPreprocessor()
+postprocessor = StructCharPostprocessor()
+
+labeler = dp.DataLabeler(labeler_type='structured')
+labeler.set_preprocessor(preprocessor)
+labeler.set_model(model)
+labeler.set_postprocessor(postprocessor)
+
+# check for basic compatibility between the processors and the model
+labeler.check_pipeline()
+
+# Optionally set the parameters
+parameters={
+    'preprocessor':{
+        'max_length': 100,
+    },
+    'model':{
+        'max_length': 100,
+    },
+    'postprocessor':{
+        'random_state': random.Random(1)
+    }
+}
+labeler.set_params(parameters)
+
+labeler.help()
+
+
+
+

The components can each be created if you inherit the BaseModel and BaseProcessor for the model and processors, respectively. More info about coding your own components can be found in the Labeler section of the documentation. In summary, the Data Profiler open source library can be used to scan sensitive information in both structured and unstructured data with different file types. It supports multiple input formats and output formats at word and character levels. Users can also train the labeler on their own datasets.

+
+
+ +
+ +
+ +
+
+ + + + + + + + + \ No newline at end of file diff --git a/docs/0.7.1/html/labeler.ipynb b/docs/0.7.1/html/labeler.ipynb new file mode 100644 index 000000000..c0d0bf359 --- /dev/null +++ b/docs/0.7.1/html/labeler.ipynb @@ -0,0 +1,622 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "spoken-reunion", + "metadata": {}, + "source": [ + "# Sensitive Data Detection with the Labeler" + ] + }, + { + "cell_type": "markdown", + "id": "interesting-bidder", + "metadata": {}, + "source": [ + "In this example, we utilize the Labeler component of the Data Profiler to detect the sensitive information for both structured and unstructured data. In addition, we show how to train the Labeler on some specific dataset with different list of entities.\n", + "\n", + "First, let's dive into what the Labeler is." + ] + }, + { + "cell_type": "markdown", + "id": "1965b83b", + "metadata": {}, + "source": [ + "## What is the Labeler" + ] + }, + { + "cell_type": "markdown", + "id": "388c643f", + "metadata": {}, + "source": [ + "The Labeler is a pipeline designed to make building, training, and predictions with ML models quick and easy. There are 3 major components to the Labeler: the preprocessor, the model, and the postprocessor." + ] + }, + { + "cell_type": "markdown", + "id": "e5d0aeb4", + "metadata": {}, + "source": [ + "![alt text](DL-Flowchart.png \"Title\")" + ] + }, + { + "cell_type": "markdown", + "id": "550323c7", + "metadata": {}, + "source": [ + "Each component can be switched out individually to suit your needs. As you might expect, the preprocessor takes in raw data and prepares it for the model, the model performs the prediction or training, and the postprocessor takes prediction results and turns them into human-readable results. \n", + "\n", + "Now let's run some examples. Start by importing all the requirements." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "scientific-stevens", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import json\n", + "import pandas as pd\n", + "sys.path.insert(0, '..')\n", + "import dataprofiler as dp\n", + "import tensorflow as tf\n", + "tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)" + ] + }, + { + "cell_type": "markdown", + "id": "5125b215", + "metadata": {}, + "source": [ + "## Structured Data Prediction" + ] + }, + { + "cell_type": "markdown", + "id": "wicked-devon", + "metadata": {}, + "source": [ + "We'll use the aws honeypot dataset in the test folder for this example. First, look at the data using the Data Reader class of the Data Profiler. This dataset is from the US department of educations, [found here!](https://data.ed.gov/dataset/college-scorecard-all-data-files-through-6-2020/resources?resource=823ac095-bdfc-41b0-b508-4e8fc3110082)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "adjusted-native", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "data = dp.Data(\"../dataprofiler/tests/data/csv/SchoolDataSmall.csv\")\n", + "df_data = data.data\n", + "df_data.head()" + ] + }, + { + "cell_type": "markdown", + "id": "ab6ccf8a", + "metadata": {}, + "source": [ + "We can directly predict the labels of a structured dataset on the cell level." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19529af4", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "labeler = dp.DataLabeler(labeler_type='structured')\n", + "\n", + "# print out the labels and label mapping\n", + "print(\"Labels: {}\".format(labeler.labels)) \n", + "print(\"\\n\")\n", + "print(\"Label Mapping: {}\".format(labeler.label_mapping))\n", + "print(\"\\n\")\n", + "\n", + "# make predictions and get labels for each cell going row by row\n", + "# predict options are model dependent and the default model can show prediction confidences\n", + "predictions = labeler.predict(data, predict_options={\"show_confidences\": True})\n", + "\n", + "# display prediction results\n", + "print(\"Predictions: {}\".format(predictions['pred']))\n", + "print(\"\\n\")\n", + "\n", + "# display confidence results\n", + "print(\"Confidences: {}\".format(predictions['conf']))" + ] + }, + { + "cell_type": "markdown", + "id": "2af72e2c", + "metadata": {}, + "source": [ + "The profiler uses the Labeler to perform column by column predictions. The data contains 11 columns, each of which has data label. Next, we will use the Labeler of the Data Profiler to predict the label for each column in this tabular dataset. Since we are only going to demo the labeling functionality, other options of the Data Profiler are disabled to keep this quick." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "secret-million", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# set options to only run the labeler\n", + "profile_options = dp.ProfilerOptions()\n", + "profile_options.set({\"structured_options.text.is_enabled\": False, \n", + " \"int.is_enabled\": False, \n", + " \"float.is_enabled\": False, \n", + " \"order.is_enabled\": False, \n", + " \"category.is_enabled\": False, \n", + " \"datetime.is_enabled\": False,})\n", + "\n", + "profile = dp.Profiler(data, options=profile_options)\n", + "\n", + "# get the prediction from the data profiler\n", + "def get_structured_results(results):\n", + " columns = []\n", + " predictions = []\n", + " for col_report in results['data_stats']:\n", + " columns.append(col_report['column_name'])\n", + " predictions.append(col_report['data_label'])\n", + "\n", + " df_results = pd.DataFrame({'Column': columns, 'Prediction': predictions})\n", + " return df_results\n", + "\n", + "results = profile.report() \n", + "print(get_structured_results(results))" + ] + }, + { + "cell_type": "markdown", + "id": "fatty-louisville", + "metadata": {}, + "source": [ + "In this example, the results show that the Data Profiler is able to detect integers, URLs, address, and floats appropriately. Unknown is typically strings of text, which is appropriate for those columns." + ] + }, + { + "cell_type": "markdown", + "id": "unavailable-diploma", + "metadata": {}, + "source": [ + "## Unstructured Data Prediction" + ] + }, + { + "cell_type": "markdown", + "id": "metallic-coaching", + "metadata": {}, + "source": [ + "Besides structured data, the Labeler detects the sensitive information on the unstructured text. We use a sample of spam email in Enron email dataset for this demo. As above, we start investigating the content of the given email sample." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "unauthorized-lounge", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# load data\n", + "data = \"Message-ID: <11111111.1111111111111.JavaMail.evans@thyme>\\n\" + \\\n", + " \"Date: Fri, 10 Aug 2005 11:31:37 -0700 (PDT)\\n\" + \\\n", + " \"From: w..smith@company.com\\n\" + \\\n", + " \"To: john.smith@company.com\\n\" + \\\n", + " \"Subject: RE: ABC\\n\" + \\\n", + " \"Mime-Version: 1.0\\n\" + \\\n", + " \"Content-Type: text/plain; charset=us-ascii\\n\" + \\\n", + " \"Content-Transfer-Encoding: 7bit\\n\" + \\\n", + " \"X-From: Smith, Mary W. \\n\" + \\\n", + " \"X-To: Smith, John \\n\" + \\\n", + " \"X-cc: \\n\" + \\\n", + " \"X-bcc: \\n\" + \\\n", + " \"X-Folder: \\SSMITH (Non-Privileged)\\Sent Items\\n\" + \\\n", + " \"X-Origin: Smith-S\\n\" + \\\n", + " \"X-FileName: SSMITH (Non-Privileged).pst\\n\\n\" + \\\n", + " \"All I ever saw was the e-mail from the office.\\n\\n\" + \\\n", + " \"Mary\\n\\n\" + \\\n", + " \"-----Original Message-----\\n\" + \\\n", + " \"From: Smith, John \\n\" + \\\n", + " \"Sent: Friday, August 10, 2005 13:07 PM\\n\" + \\\n", + " \"To: Smith, Mary W.\\n\" + \\\n", + " \"Subject: ABC\\n\\n\" + \\\n", + " \"Have you heard any more regarding the ABC sale? I guess that means that \" + \\\n", + " \"it's no big deal here, but you think they would have send something.\\n\\n\\n\" + \\\n", + " \"John Smith\\n\" + \\\n", + " \"123-456-7890\\n\"\n", + "\n", + "# convert string data to list to feed into the labeler\n", + "data = [data]" + ] + }, + { + "cell_type": "markdown", + "id": "concerned-segment", + "metadata": {}, + "source": [ + "By default, the Labeler predicts the results at the character level for unstructured text." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "junior-acrobat", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "labeler = dp.DataLabeler(labeler_type='unstructured')\n", + "\n", + "# make predictions and get labels per character\n", + "predictions = labeler.predict(data)\n", + "\n", + "# display results\n", + "print(predictions['pred'])" + ] + }, + { + "cell_type": "markdown", + "id": "individual-diabetes", + "metadata": {}, + "source": [ + "In addition to the character-level result, the Labeler provides the results at the word level following the standard NER (Named Entity Recognition), e.g., utilized by spaCy. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "optical-universe", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# convert prediction to word format and ner format\n", + "# Set the output to the NER format (start position, end position, label)\n", + "labeler.set_params(\n", + " { 'postprocessor': { 'output_format':'ner', 'use_word_level_argmax':True } } \n", + ")\n", + "\n", + "# make predictions and get labels per character\n", + "predictions = labeler.predict(data)\n", + "\n", + "# display results\n", + "print('\\n')\n", + "print('=======================Prediction======================\\n')\n", + "for pred in predictions['pred'][0]:\n", + " print('{}: {}'.format(data[0][pred[0]: pred[1]], pred[2]))\n", + " print('--------------------------------------------------------')" + ] + }, + { + "cell_type": "markdown", + "id": "behavioral-tourism", + "metadata": {}, + "source": [ + "Here, the Labeler is able to identify sensitive information such as datetime, email address, person names, and phone number in an email sample. 
" + ] + }, + { + "cell_type": "markdown", + "id": "nasty-disney", + "metadata": {}, + "source": [ + "## Train the Labeler from Scratch" + ] + }, + { + "cell_type": "markdown", + "id": "destroyed-twist", + "metadata": {}, + "source": [ + "The Labeler can be trained from scratch with a new list of labels. Below, we show an example of training the Labeler on a dataset with labels given as the columns of that dataset. For brevity's sake, let's only train a few epochs with a subset of a dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "utility-evaluation", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "data = dp.Data(\"../dataprofiler/tests/data/csv/SchoolDataSmall.csv\")\n", + "df = data.data[[\"OPEID6\", \"INSTURL\", \"SEARCH_STRING\"]]\n", + "df.head()\n", + "\n", + "# split data to training and test set\n", + "split_ratio = 0.2\n", + "df = df.sample(frac=1).reset_index(drop=True)\n", + "data_train = df[:int((1 - split_ratio) * len(df))]\n", + "data_test = df[int((1 - split_ratio) * len(df)):]\n", + "\n", + "# train a new labeler with column names as labels\n", + "if not os.path.exists('data_labeler_saved'):\n", + " os.makedirs('data_labeler_saved')\n", + "\n", + "labeler = dp.train_structured_labeler(\n", + " data=data_train,\n", + " save_dirpath=\"data_labeler_saved\",\n", + " epochs=10,\n", + " default_label=\"OPEID6\"\n", + ")\n" + ] + }, + { + "cell_type": "markdown", + "id": "utility-torture", + "metadata": {}, + "source": [ + "The trained Labeler is then used by the Data Profiler to provide the prediction on the new dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "answering-panel", + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "# predict with the labeler object\n", + "profile_options.set({'structured_options.data_labeler.data_labeler_object': labeler})\n", + "profile = dp.Profiler(data_test, options=profile_options)\n", + "\n", + "# get the prediction from the data profiler\n", + "results = profile.report()\n", + "print(get_structured_results(results))" + ] + }, + { + "cell_type": "markdown", + "id": "polish-stand", + "metadata": {}, + "source": [ + "Another way to use the trained Labeler is through the directory path of the saved labeler." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "industrial-characterization", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# predict with the labeler loaded from path\n", + "profile_options.set({'structured_options.data_labeler.data_labeler_dirpath': 'data_labeler_saved'})\n", + "profile = dp.Profiler(data_test, options=profile_options)\n", + "\n", + "# get the prediction from the data profiler\n", + "results = profile.report()\n", + "print(get_structured_results(results))" + ] + }, + { + "cell_type": "markdown", + "id": "2acedba0", + "metadata": {}, + "source": [ + "## Transfer Learning a Labeler" + ] + }, + { + "cell_type": "markdown", + "id": "2f15fb1f", + "metadata": {}, + "source": [ + "Instead of training a model from scratch, we can also transfer learn to improve the model and/or extend the labels. Again for brevity's sake, let's only train a few epochs with a small dataset at the cost of accuracy." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0104c374", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "data = dp.Data(\"../dataprofiler/tests/data/csv/SchoolDataSmall.csv\")\n", + "df_data = data.data[[\"OPEID6\", \"INSTURL\", \"SEARCH_STRING\"]]\n", + "\n", + "\n", + "# prep data\n", + "df_data = df_data.reset_index(drop=True).melt()\n", + "df_data.columns = [1, 0] # labels=1, values=0 in that order\n", + "df_data = df_data.astype(str)\n", + "new_labels = df_data[1].unique().tolist()\n", + "\n", + "# load structured Labeler w/ trainable set to True\n", + "labeler = dp.DataLabeler(labeler_type='structured', trainable=True)\n", + "\n", + "# Reconstruct the model to add each new label\n", + "for label in new_labels:\n", + " labeler.add_label(label)\n", + "\n", + "# this will use transfer learning to retrain the labeler on your new\n", + "# dataset and labels.\n", + "# Setting labels with a list of labels or label mapping will overwrite the existing labels with new ones\n", + "# Setting the reset_weights parameter to false allows transfer learning to occur\n", + "model_results = labeler.fit(x=df_data[0], y=df_data[1], validation_split=0.2, \n", + " epochs=10, labels=None, reset_weights=False)" + ] + }, + { + "cell_type": "markdown", + "id": "ae78745f", + "metadata": {}, + "source": [ + "Let's display the training results of the last epoch:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b764aa8c", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "print(\"{:16s} Precision Recall F1-score Support\".format(\"\"))\n", + "for item in model_results[-1][2]:\n", + " print(\"{:16s} {:4.3f} {:4.3f} {:4.3f} {:7.0f}\".format(item,\n", + " model_results[-1][2][item][\"precision\"],\n", + " model_results[-1][2][item][\"recall\"],\n", + " model_results[-1][2][item][\"f1-score\"],\n", + " model_results[-1][2][item][\"support\"]))" + ] + }, + { + "cell_type": "markdown", + "id": "44009522", + "metadata": {}, + "source": [ + "It is now trained to detect additional labels! The model results here show all the labels training accuracy. Since only new labels existed in the dataset, only the new labels are given accuracy scores. Keep in mind this is a small dataset for brevity's sake and that real training would involve more samples and better results." + ] + }, + { + "cell_type": "markdown", + "id": "e110ee1c", + "metadata": {}, + "source": [ + "## Saving and Loading a Labeler" + ] + }, + { + "cell_type": "markdown", + "id": "c484d193", + "metadata": {}, + "source": [ + "The Labeler can easily be saved or loaded with one simple line." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d8684fa", + "metadata": {}, + "outputs": [], + "source": [ + "# Ensure save directory exists\n", + "if not os.path.exists('my_labeler'):\n", + " os.makedirs('my_labeler')\n", + "\n", + "# Saving the labeler\n", + "labeler.save_to_disk(\"my_labeler\")\n", + "\n", + "# Loading the labeler\n", + "labeler = dp.DataLabeler(labeler_type='structured', dirpath=\"my_labeler\")" + ] + }, + { + "cell_type": "markdown", + "id": "8d36dec8", + "metadata": {}, + "source": [ + "## Building a Labeler from the Ground Up" + ] + }, + { + "cell_type": "markdown", + "id": "59346d2b", + "metadata": {}, + "source": [ + "As mentioned earlier, the labeler is comprised of three components, and each of the compenents can be created and interchanged in the the labeler pipeline." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6506ef97", + "metadata": {}, + "outputs": [], + "source": [ + "import random\n", + "from dataprofiler.labelers.character_level_cnn_model import \\\n", + " CharacterLevelCnnModel\n", + "from dataprofiler.labelers.data_processing import \\\n", + " StructCharPreprocessor, StructCharPostprocessor\n", + "\n", + "model = CharacterLevelCnnModel({\"PAD\":0, \"UNKNOWN\":1, \"Test_Label\":2})\n", + "preprocessor = StructCharPreprocessor()\n", + "postprocessor = StructCharPostprocessor()\n", + "\n", + "labeler = dp.DataLabeler(labeler_type='structured')\n", + "labeler.set_preprocessor(preprocessor)\n", + "labeler.set_model(model)\n", + "labeler.set_postprocessor(postprocessor)\n", + "\n", + "# check for basic compatibility between the processors and the model\n", + "labeler.check_pipeline()\n", + "\n", + "# Optionally set the parameters\n", + "parameters={\n", + " 'preprocessor':{\n", + " 'max_length': 100,\n", + " },\n", + " 'model':{\n", + " 'max_length': 100,\n", + " },\n", + " 'postprocessor':{\n", + " 'random_state': random.Random(1)\n", + " }\n", + "} \n", + "labeler.set_params(parameters)\n", + "\n", + "labeler.help()" + ] + }, + { + "cell_type": "markdown", + "id": "5f020d7f", + "metadata": {}, + "source": [ + "The components can each be created if you inherit the BaseModel and BaseProcessor for the model and processors, respectively. More info can be found about coding your own components in the Labeler section of the [documentation]( https://capitalone.github.io/dataprofiler). In summary, the Data Profiler open source library can be used to scan sensitive information in both structured and unstructured data with different file types. It supports multiple input formats and output formats at word and character levels. Users can also train the labeler on their own datasets." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/0.7.1/html/modules.html b/docs/0.7.1/html/modules.html new file mode 100644 index 000000000..d1f555911 --- /dev/null +++ b/docs/0.7.1/html/modules.html @@ -0,0 +1,286 @@ + + + + + + + + + dataprofiler - Data Profiler v0.7.1 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+ + +
+ +
+
+ + + + + + + \ No newline at end of file diff --git a/docs/0.7.1/html/objects.inv b/docs/0.7.1/html/objects.inv new file mode 100644 index 000000000..2be8fdbcf Binary files /dev/null and b/docs/0.7.1/html/objects.inv differ diff --git a/docs/0.7.1/html/overview.html b/docs/0.7.1/html/overview.html new file mode 100644 index 000000000..3fe29206e --- /dev/null +++ b/docs/0.7.1/html/overview.html @@ -0,0 +1,834 @@ + + + + + + + + + Data Profiler - What’s in your data? - Data Profiler v0.7.1 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ + + +

View this notebook on GitHub

+
+

Data Profiler - What’s in your data?

+

This introductory Jupyter notebook demonstrates the basic usage of the Data Profiler. The library is designed to easily detect sensitive data and gather statistics on your datasets with just several lines of code. The Data Profiler can handle several different data types including: CSV (or any delimited file), JSON, Parquet, AVRO, and text. Additionally, there are a plethora of options to customize your profile. This library also has the ability to update profiles from multiple batches of large datasets, or merge multiple profiles. In particular, this example covers the following:

+
- Basic usage of the Data Profiler
+- The data reader class
+- Profiler options
+- Updating profiles and merging profiles
+
+
+

First, let’s import the libraries needed for this example.

+
+
[ ]:
+
+
+
+import os
+import sys
+import json
+import pandas as pd
+import matplotlib.pyplot as plt
+sys.path.insert(0, '..')
+import dataprofiler as dp
+
+data_path = "../dataprofiler/tests/data"
+
+
+
+
+

Basic Usage of the Data Profiler

+

This section shows the basic example of the Data Profiler. A CSV dataset is read using the data reader, then the Data object is given to the Data Profiler to detect sensitive data and obtain the statistics.

+
+
[ ]:
+
+
+
+# use data reader to read input data
+data = dp.Data(os.path.join(data_path, "csv/aws_honeypot_marx_geo.csv"))
+print(data.data.head())
+
+# run data profiler and get the report
+profile = dp.Profiler(data)
+report  = profile.report(report_options={"output_format":"compact"})
+
+# print the report
+print(json.dumps(report, indent=4))
+
+
+
+

The report includes global_stats and data_stats for the given dataset. The former contains overall properties of the data such as the number of rows/columns, null ratio, and duplicate ratio, while the latter contains specific properties and statistics for each column such as detected data label, min, max, mean, variance, etc. In this example, the compact format of the report is used to shorten the full list of the results. To get more detailed results, such as entity-level predictions from the Data Labeler component or histogram results, the pretty format should be used.

+
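For example, as a quick sketch reusing the profile and json objects from the cell above, requesting the more detailed format is a one-line change:

+report = profile.report(report_options={"output_format":"pretty"})
+print(json.dumps(report, indent=4))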
+
+

Data reader class

+

DataProfiler can detect multiple file types including CSV (or any delimited file), JSON, Parquet, AVRO, and text. The example below shows that it successfully detects data types from multiple categories regardless of the file extensions.

+
+
[ ]:
+
+
+
+# use data reader to read input data with different file types
+csv_files = [
+    "csv/aws_honeypot_marx_geo.csv",
+    "csv/all-strings-skip-header-author.csv", # csv files with the author/description on the first line
+    "csv/sparse-first-and-last-column-empty-first-row.txt", # csv file with the .txt extension
+]
+json_files = [
+    "json/complex_nested.json",
+    "json/honeypot_intentially_mislabeled_file.csv", # json file with the .csv extension
+]
+parquet_files = [
+    "parquet/nation.dict.parquet",
+    "parquet/nation.plain.intentionally_mislabled_file.csv", # parquet file with the .csv extension
+]
+avro_files = [
+    "avro/userdata1.avro",
+    "avro/userdata1_intentionally_mislabled_file.json", # avro file with the .json extension
+]
+text_files = [
+    "txt/discussion_reddit.txt",
+]
+
+all_files = {
+    "csv": csv_files,
+    "json": json_files,
+    "parquet": parquet_files,
+    "avro": avro_files,
+    "text": text_files
+}
+
+for file_type in all_files:
+    print(file_type)
+    for file in all_files[file_type]:
+        data = dp.Data(os.path.join(data_path, file))
+        print("{:<85} {:<15}".format(file, data.data_type))
+    print("\n")
+
+
+
+

The Data class detects the file type and uses one of the following classes: CSVData, JSONData, ParquetData, AVROData, TextData. Users can call these specific classes directly if desired. For example, below we provide a collection of data with different types; each is processed by the corresponding data class.

+
+
[ ]:
+
+
+
+# use individual data reader classes
+from dataprofiler.data_readers.csv_data import CSVData
+from dataprofiler.data_readers.json_data import JSONData
+from dataprofiler.data_readers.parquet_data import ParquetData
+from dataprofiler.data_readers.avro_data import AVROData
+from dataprofiler.data_readers.text_data import TextData
+
+csv_files = "csv/aws_honeypot_marx_geo.csv"
+json_files = "json/complex_nested.json"
+parquet_files = "parquet/nation.dict.parquet"
+avro_files = "avro/userdata1.avro"
+text_files = "txt/discussion_reddit.txt"
+
+all_files = {
+    "csv": [csv_files, CSVData],
+    "json": [json_files, JSONData],
+    "parquet": [parquet_files, ParquetData],
+    "avro": [avro_files, AVROData],
+    "text": [text_files, TextData],
+}
+
+for file_type in all_files:
+    file, data_reader = all_files[file_type]
+    data = data_reader(os.path.join(data_path, file))
+    print("File name {}\n".format(file))
+    if file_type == "text":
+        print(data.data[0][:1000]) # print the first 1000 characters
+    else:
+        print(data.data)
+    print('===============================================================================')
+
+
+
+

In addition to reading the input data from multiple file types, the Data Profiler also accepts input data as a DataFrame.

+
+
[ ]:
+
+
+
+# run data profiler and get the report
+my_dataframe = pd.DataFrame([[1, 2.0],[1, 2.2],[-1, 3]], columns=["col_int", "col_float"])
+profile = dp.Profiler(my_dataframe)
+report  = profile.report(report_options={"output_format":"compact"})
+
+# Print the report
+print(json.dumps(report, indent=4))
+
+
+
+
+
+

Structured Profiler vs. Unstructured Profiler

+

The profiler will infer what type of statistics to generate (structured or unstructured) based on the input. However, you can explicitly specify the profile type as well. Here is an example of the profiler explicitly calling the structured profile and the unstructured profile.

+
+
[ ]:
+
+
+
+# Using the structured profiler
+data = dp.Data(os.path.join(data_path, "csv/aws_honeypot_marx_geo.csv"))
+profile = dp.Profiler(data, profiler_type='structured')
+
+report = profile.report(report_options={"output_format": "pretty"})
+print(json.dumps(report, indent=4))
+
+# Using the unstructured profiler
+my_dataframe = pd.DataFrame([["Sample1"],["Sample2"],["Sample3"]], columns=["Text_Samples"])
+profile = dp.Profiler(my_dataframe, profiler_type='unstructured')
+
+report  = profile.report(report_options={"output_format":"pretty"})
+print(json.dumps(report, indent=4))
+
+
+
+
+
+

Profiler options

+

The Data Profiler can enable/disable statistics and modify features through profiler options. For example, if the users only want the statistics information, they may turn off the Data Labeler functionality. Below, let’s remove the histogram and data labeler component while running Data Profiler.

+
+
[ ]:
+
+
+
+profile_options = dp.ProfilerOptions()
+profile_options.set({"histogram_and_quantiles.is_enabled": False,
+                     "data_labeler.is_enabled": False,})
+
+profile = dp.Profiler(my_dataframe, options=profile_options)
+report  = profile.report(report_options={"output_format":"pretty"})
+
+# Print the report
+print(json.dumps(report, indent=4))
+
+
+
+

Besides toggling on and off features, other options like the data labeler sample size or histogram bin method can be directly set and validated as shown here:

+
+
[ ]:
+
+
+
+profile_options = dp.ProfilerOptions()
+profile_options.structured_options.data_labeler.sample_size = 1
+profile_options.structured_options.int.histogram_and_quantiles.bin_count_or_method = "rice"
+# An error will be raised if the options are set incorrectly.
+profile_options.validate()
+
+profile = dp.Profiler(my_dataframe, options=profile_options)
+report  = profile.report(report_options={"output_format":"pretty"})
+
+# Print the report
+print(json.dumps(report, indent=4))
+
+
+
+
+
+

Update profiles

+

One of the interesting features of the Data Profiler is the ability to update profiles from batches of data, which allows for data streaming usage. In this section, the original dataset is separated into two equally sized batches. The profile is then updated with each batch sequentially.

+

After the update, we expect the resulting profile to give the same statistics as a profile computed on the full dataset. We will verify that through some properties in global_stats of the profiles, including column_count, row_count, row_is_null_ratio, and duplicate_row_count.

+
+
[ ]:
+
+
+
+# read the input data and divide it into two equal halves
+data = dp.Data(os.path.join(data_path, "csv/aws_honeypot_marx_geo.csv"))
+df = data.data
+df1 = df.iloc[:int(len(df)/2)]
+df2 = df.iloc[int(len(df)/2):]
+
+# Update the profile with the first half
+profile = dp.Profiler(df1)
+
+# Update the profile with the second half
+profile.update_profile(df2)
+
+# Update profile with the full dataset
+profile_full = dp.Profiler(df)
+
+report  = profile.report(report_options={"output_format":"compact"})
+report_full  = profile_full.report(report_options={"output_format":"compact"})
+
+# print the report
+print(json.dumps(report, indent=4))
+print(json.dumps(report_full, indent=4))
+
+
+
+

You can see that the profiles are exactly the same whether they are broken into several updates or not.

+
+
+

Merge profiles

+

In addition to the profile update, Data Profiler provides merging functionality which allows users to combine profiles updated in multiple locations. This enables Data Profiler to be used in a distributed computing environment. Below, we assume that the two aforementioned halves of the original dataset come from two different machines. Each of them is profiled with the Data Profiler on its own machine, and the resulting profiles are then merged.

+

As with the profile update, we expect the merged profile to give the same statistics as a profile computed on the full dataset.

+
+
[ ]:
+
+
+
+# Update the profile with the first half
+profile1 = dp.Profiler(df1)
+
+# Update the profile with the second half
+profile2 = dp.Profiler(df2)
+
+# merge profiles
+profile_merge = profile1 + profile2
+
+# check results of the merged profile
+report_merge  = profile_merge.report(report_options={"output_format":"compact"})
+
+# print the report
+print(json.dumps(report_merge, indent=4))
+print(json.dumps(report_full, indent=4))
+
+
+
+

You can see that the profiles are exactly the same!

+
+
+

Conclusion

+

We have walked through some basic examples of Data Profiler usage, with different input data types and profiling options. We also worked with the update and merging functionality of the Data Profiler, which makes it applicable to data streaming and distributed environments. Interested users can try different datasets and functionalities as desired.

+
+
+ +
+ +
+ +
+
+ + + + + + + + + \ No newline at end of file diff --git a/docs/0.7.1/html/overview.ipynb b/docs/0.7.1/html/overview.ipynb new file mode 100644 index 000000000..dafec60ab --- /dev/null +++ b/docs/0.7.1/html/overview.ipynb @@ -0,0 +1,463 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "fc2826d9", + "metadata": {}, + "source": [ + "# Data Profiler - What's in your data?" + ] + }, + { + "cell_type": "markdown", + "id": "b997522b", + "metadata": {}, + "source": [ + "This introductory jupyter notebook demonstrates the basic usages of the Data Profiler. The library is designed to easily detect sensitive data and gather statistics on your datasets with just several lines of code. The Data Profiler can handle several different data types including: CSV (or any delimited file), JSON, Parquet, AVRO, and text. Additionally, there are a plethora of options to customize your profile. This library also has the ability to update profiles from multiple batches of large datasets, or merge multiple profiles. In particular, this example covers the followings:\n", + "\n", + " - Basic usage of the Data Profiler\n", + " - The data reader class\n", + " - Profiler options\n", + " - Updating profiles and merging profiles\n", + "\n", + "First, let's import the libraries needed for this example." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ef404c84", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import json\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "sys.path.insert(0, '..')\n", + "import dataprofiler as dp\n", + "\n", + "data_path = \"../dataprofiler/tests/data\"" + ] + }, + { + "cell_type": "markdown", + "id": "f51971e3", + "metadata": {}, + "source": [ + "## Basic Usage of the Data Profiler" + ] + }, + { + "cell_type": "markdown", + "id": "639e66d3", + "metadata": {}, + "source": [ + "This section shows the basic example of the Data Profiler. A CSV dataset is read using the data reader, then the Data object is given to the Data Profiler to detect sensitive data and obtain the statistics." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5379c45c", + "metadata": {}, + "outputs": [], + "source": [ + "# use data reader to read input data\n", + "data = dp.Data(os.path.join(data_path, \"csv/aws_honeypot_marx_geo.csv\"))\n", + "print(data.data.head())\n", + "\n", + "# run data profiler and get the report\n", + "profile = dp.Profiler(data)\n", + "report = profile.report(report_options={\"output_format\":\"compact\"})\n", + "\n", + "# print the report\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "57fe2827", + "metadata": {}, + "source": [ + "The report includes `global_stats` and `data_stats` for the given dataset. The former contains overall properties of the data such as number of rows/columns, null ratio, duplicate ratio, while the latter contains specific properties and statistics for each column such as detected data label, min, max, mean, variance, etc. In this example, the `compact` format of the report is used to shorten the full list of the results. To get more results related to detailed predictions at the entity level from the Data Labeler component or histogram results, the format `pretty` should be used." 
+ ] + }, + { + "cell_type": "markdown", + "id": "74027cfd", + "metadata": {}, + "source": [ + "## Data reader class" + ] + }, + { + "cell_type": "markdown", + "id": "41364888", + "metadata": {}, + "source": [ + "DataProfiler can detect multiple file types including CSV (or any delimited file), JSON, Parquet, AVRO, and text. The example below shows that it successfully detects data types from multiple categories regardless of the file extensions." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "823829f4", + "metadata": {}, + "outputs": [], + "source": [ + "# use data reader to read input data with different file types\n", + "csv_files = [\n", + " \"csv/aws_honeypot_marx_geo.csv\",\n", + " \"csv/all-strings-skip-header-author.csv\", # csv files with the author/description on the first line\n", + " \"csv/sparse-first-and-last-column-empty-first-row.txt\", # csv file with the .txt extension\n", + "]\n", + "json_files = [\n", + " \"json/complex_nested.json\",\n", + " \"json/honeypot_intentially_mislabeled_file.csv\", # json file with the .csv extension\n", + "]\n", + "parquet_files = [\n", + " \"parquet/nation.dict.parquet\",\n", + " \"parquet/nation.plain.intentionally_mislabled_file.csv\", # parquet file with the .csv extension\n", + "]\n", + "avro_files = [\n", + " \"avro/userdata1.avro\",\n", + " \"avro/userdata1_intentionally_mislabled_file.json\", # avro file with the .json extension\n", + "]\n", + "text_files = [\n", + " \"txt/discussion_reddit.txt\",\n", + "]\n", + "\n", + "all_files = {\n", + " \"csv\": csv_files,\n", + " \"json\": json_files,\n", + " \"parquet\": parquet_files,\n", + " \"avro\": avro_files,\n", + " \"text\": text_files\n", + "}\n", + "\n", + "for file_type in all_files:\n", + " print(file_type)\n", + " for file in all_files[file_type]:\n", + " data = dp.Data(os.path.join(data_path, file))\n", + " print(\"{:<85} {:<15}\".format(file, data.data_type))\n", + " print(\"\\n\")" + ] + }, + { + "cell_type": "markdown", + "id": "3f9d7e02", + "metadata": {}, + "source": [ + "The `Data` class detects the file type and uses one of the following classes: `CSVData`, `JSONData`, `ParquetData`, `AVROData`, `TextData`. Users can call these specific classes directly if desired. For example, below we provide a collection of data with different types, each of them is processed by the corresponding data class." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "831e68a3", + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "# use individual data reader classes\n", + "from dataprofiler.data_readers.csv_data import CSVData\n", + "from dataprofiler.data_readers.json_data import JSONData\n", + "from dataprofiler.data_readers.parquet_data import ParquetData\n", + "from dataprofiler.data_readers.avro_data import AVROData\n", + "from dataprofiler.data_readers.text_data import TextData\n", + "\n", + "csv_files = \"csv/aws_honeypot_marx_geo.csv\"\n", + "json_files = \"json/complex_nested.json\"\n", + "parquet_files = \"parquet/nation.dict.parquet\"\n", + "avro_files = \"avro/userdata1.avro\"\n", + "text_files = \"txt/discussion_reddit.txt\"\n", + "\n", + "all_files = {\n", + " \"csv\": [csv_files, CSVData],\n", + " \"json\": [json_files, JSONData],\n", + " \"parquet\": [parquet_files, ParquetData],\n", + " \"avro\": [avro_files, AVROData],\n", + " \"text\": [text_files, TextData],\n", + "}\n", + "\n", + "for file_type in all_files:\n", + " file, data_reader = all_files[file_type]\n", + " data = data_reader(os.path.join(data_path, file))\n", + " print(\"File name {}\\n\".format(file))\n", + " if file_type == \"text\":\n", + " print(data.data[0][:1000]) # print the first 1000 characters\n", + " else:\n", + " print(data.data)\n", + " print('===============================================================================')" + ] + }, + { + "cell_type": "markdown", + "id": "572df0a8", + "metadata": {}, + "source": [ + "In addition to reading the input data from multiple file types, the Data Profiler allows the input data as a dataframe." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "df87ab83", + "metadata": {}, + "outputs": [], + "source": [ + "# run data profiler and get the report\n", + "my_dataframe = pd.DataFrame([[1, 2.0],[1, 2.2],[-1, 3]], columns=[\"col_int\", \"col_float\"])\n", + "profile = dp.Profiler(my_dataframe)\n", + "report = profile.report(report_options={\"output_format\":\"compact\"})\n", + "\n", + "# Print the report\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "84a06312", + "metadata": {}, + "source": [ + "## Structured Profiler vs. Unstructured Profiler" + ] + }, + { + "cell_type": "markdown", + "id": "4c0ea925", + "metadata": {}, + "source": [ + "The profiler will infer what type of statistics to generate (structured or unstructured) based on the input. However, you can explicitly specify profile type as well. Here is an example of the the profiler explicitly calling the structured profile and the unstructured profile." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f4565d8", + "metadata": {}, + "outputs": [], + "source": [ + "# Using the structured profiler\n", + "data = dp.Data(os.path.join(data_path, \"csv/aws_honeypot_marx_geo.csv\"))\n", + "profile = dp.Profiler(data, profiler_type='structured')\n", + "\n", + "report = profile.report(report_options={\"output_format\": \"pretty\"})\n", + "print(json.dumps(report, indent=4))\n", + "\n", + "# Using the unstructured profiler\n", + "my_dataframe = pd.DataFrame([[\"Sample1\"],[\"Sample2\"],[\"Sample3\"]], columns=[\"Text_Samples\"])\n", + "profile = dp.Profiler(my_dataframe, profiler_type='unstructured')\n", + "\n", + "report = profile.report(report_options={\"output_format\":\"pretty\"})\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "b16648ba", + "metadata": {}, + "source": [ + "## Profiler options" + ] + }, + { + "cell_type": "markdown", + "id": "8b0cc8ad", + "metadata": {}, + "source": [ + "The Data Profiler can enable/disable statistics and modify features through profiler options. For example, if the users only want the statistics information, they may turn off the Data Labeler functionality. Below, let's remove the histogram and data labeler component while running Data Profiler." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bbac3a2c", + "metadata": {}, + "outputs": [], + "source": [ + "profile_options = dp.ProfilerOptions()\n", + "profile_options.set({\"histogram_and_quantiles.is_enabled\": False,\n", + " \"data_labeler.is_enabled\": False,})\n", + "\n", + "profile = dp.Profiler(my_dataframe, options=profile_options)\n", + "report = profile.report(report_options={\"output_format\":\"pretty\"})\n", + "\n", + "# Print the report\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "590ca50b", + "metadata": {}, + "source": [ + "Besides toggling on and off features, other options like the data labeler sample size or histogram bin method can be directly set and validated as shown here:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4ed21bc1", + "metadata": {}, + "outputs": [], + "source": [ + "profile_options = dp.ProfilerOptions()\n", + "profile_options.structured_options.data_labeler.sample_size = 1\n", + "profile_options.structured_options.int.histogram_and_quantiles.bin_count_or_method = \"rice\"\n", + "# An error will raise if the options are set incorrectly.\n", + "profile_options.validate()\n", + "\n", + "profile = dp.Profiler(my_dataframe, options=profile_options)\n", + "report = profile.report(report_options={\"output_format\":\"pretty\"})\n", + "\n", + "# Print the report\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "9f690616", + "metadata": {}, + "source": [ + "## Update profiles" + ] + }, + { + "cell_type": "markdown", + "id": "965f8c85", + "metadata": {}, + "source": [ + "One of the interesting features of the Data Profiler is the ability to update profiles from batches of data, which allows for data streaming usage. In this section, the original dataset is separated into two batches with equal size. Each batch is then updated with Data Profiler sequentially. \n", + "\n", + "After the update, we expect the resulted profiles give the same statistics as the profiles updated from the full dataset. 
We will verify that through some properties in `global_stats` of the profiles including `column_count`, `row_count`, `row_is_null_ratio`, `duplicate_row_count`. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "34ac4346", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# read the input data and devide it into two equal halves\n", + "data = dp.Data(os.path.join(data_path, \"csv/aws_honeypot_marx_geo.csv\"))\n", + "df = data.data\n", + "df1 = df.iloc[:int(len(df)/2)]\n", + "df2 = df.iloc[int(len(df)/2):]\n", + "\n", + "# Update the profile with the first half\n", + "profile = dp.Profiler(df1)\n", + "\n", + "# Update the profile with the second half\n", + "profile.update_profile(df2)\n", + "\n", + "# Update profile with the full dataset\n", + "profile_full = dp.Profiler(df)\n", + "\n", + "report = profile.report(report_options={\"output_format\":\"compact\"})\n", + "report_full = profile_full.report(report_options={\"output_format\":\"compact\"})\n", + "\n", + "# print the report\n", + "print(json.dumps(report, indent=4))\n", + "print(json.dumps(report_full, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "b41ee2bf", + "metadata": {}, + "source": [ + "You can see that the profiles are exactly the same whether they are broken into several updates or not." + ] + }, + { + "cell_type": "markdown", + "id": "c547f051", + "metadata": {}, + "source": [ + "## Merge profiles" + ] + }, + { + "cell_type": "markdown", + "id": "a5292962", + "metadata": {}, + "source": [ + "In addition to the profile update, Data Profiler provides the merging functionality which allows users to combine the profiles updated from multiple locations. This enables Data Profiler to be used in a distributed computing environment. Below, we assume that the two aforementioned halves of the original dataset come from two different machines. Each of them is then updated with the Data Profiler on the same machine, then the resulted profiles are merged.\n", + "\n", + "As with the profile update, we expect the merged profiles give the same statistics as the profiles updated from the full dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a565b8d1", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# Update the profile with the first half\n", + "profile1 = dp.Profiler(df1)\n", + "\n", + "# Update the profile with the second half\n", + "profile2 = dp.Profiler(df2)\n", + "\n", + "# merge profiles\n", + "profile_merge = profile1 + profile2\n", + "\n", + "# check results of the merged profile\n", + "report_merge = profile.report(report_options={\"output_format\":\"compact\"})\n", + "\n", + "# print the report\n", + "print(json.dumps(report_merge, indent=4))\n", + "print(json.dumps(report_full, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "b77fac3f", + "metadata": {}, + "source": [ + "You can see that the profiles are exactly the same!" + ] + }, + { + "cell_type": "markdown", + "id": "c644ee42", + "metadata": {}, + "source": [ + "## Conclusion\n", + "\n", + "We have walked through some basic examples of Data Profiler usage, with different input data types and profiling options. We also work with update and merging functionality of the Data Profiler, which make it applicable for data streaming and distributed environment. Interested users can try with different datasets and functionalities as desired." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/0.7.1/html/profiler.html b/docs/0.7.1/html/profiler.html new file mode 100644 index 000000000..727c6018b --- /dev/null +++ b/docs/0.7.1/html/profiler.html @@ -0,0 +1,1025 @@ + + + + + + + + + Profiler - Data Profiler v0.7.1 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ +
+

Profiler

+
+

Profile Your Data

+

Profiling your data is easy. Just use the data reader, send the data to the +profiler, and print out the report.

+
import json
+from dataprofiler import Data, Profiler
+
+data = Data("your_file.csv") # Auto-Detect & Load: CSV, AVRO, Parquet, JSON, Text
+
+profile = Profiler(data) # Calculate Statistics, Entity Recognition, etc
+
+readable_report = profile.report(report_options={"output_format": "pretty"})
+print(json.dumps(readable_report, indent=4))
+
+
+

If the data is structured, the profile will return global statistics as well as column-by-column statistics. The full list of available statistics is given on the intro page.

+
+

Load a File

+

The profiler should automatically identify the file type and load the data into a Data Class.

+

Along with other attributes, the Data class enables structured data to be accessed via a valid Pandas DataFrame.

+
# Load a csv file, return a CSVData object
+csv_data = Data('your_file.csv')
+
+# Print the first 10 rows of the csv file
+print(csv_data.data.head(10))
+
+# Load a parquet file, return a ParquetData object
+parquet_data = Data('your_file.parquet')
+
+# Sort the data by the name column
+parquet_data.data.sort_values(by='name', inplace=True)
+
+# Print the sorted first 10 rows of the parquet data
+print(parquet_data.data.head(10))
+
+
+

If the file type is not automatically identified (rare), you can specify them +specifically, see section Data Readers.

+
+
+

Profile a File

+

This example uses a CSV file, but JSON, Avro, or Parquet files should work as well.

+
import json
+from dataprofiler import Data, Profiler
+
+# Load file (CSV should be automatically identified)
+data = Data("your_file.csv")
+
+# Profile the dataset
+profile = Profiler(data)
+
+# Generate a report and use json to prettify.
+report  = profile.report(report_options={"output_format": "pretty"})
+
+# Print the report
+print(json.dumps(report, indent=4))
+
+
+
+
+

Updating Profiles

+

Currently, the data profiler is equipped to update its profile in batches.

+
import json
+from dataprofiler import Data, Profiler
+
+# Load and profile a CSV file
+data = Data("your_file.csv")
+profile = Profiler(data)
+
+# Update the profile with new data:
+new_data = Data("new_data.csv")
+profile.update_profile(new_data)
+
+# Print the report using json to prettify.
+report  = profile.report(report_options={"output_format": "pretty"})
+print(json.dumps(report, indent=4))
+
+
+
+
+

Merging Profiles

+

If you have two files with the same schema (but different data), it is possible to merge the two profiles together via an addition operator.

+

This also enables profiles to be determined in a distributed manner.

+
import json
+from dataprofiler import Data, Profiler
+
+# Load a CSV file with a schema
+data1 = Data("file_a.csv")
+profile1 = Profiler(data1)
+
+# Load another CSV file with the same schema
+data2 = Data("file_b.csv")
+profile2 = Profiler(data2)
+
+profile3 = profile1 + profile2
+
+# Print the report using json to prettify.
+report  = profile3.report(report_options={"output_format": "pretty"})
+print(json.dumps(report, indent=4))
+
+
+
+
+

Profile Differences

+

Profile differences take two profiles and find the differences +between them. Create the difference report like this:

+
from dataprofiler import Data, Profiler
+
+# Load a CSV file
+data1 = Data("file_a.csv")
+profile1 = Profiler(data1)
+
+# Load another CSV file
+data2 = Data("file_b.csv")
+profile2 = Profiler(data2)
+
+diff_report = profile1.diff(profile2)
+print(diff_report)
+
+
+

The difference report contains a dictionary that mirrors the profile report. +Each data type has its own difference:

+
    +
  • Int/Float - One profile subtracts the value from the other.

  • +
  • String - The strings will be shown in a list:

    +
      +
    • [profile1 str, profile2 str]

    • +
    +
  • +
  • List - A list of 3 will be returned showing the unique values of +each profile and the shared values:

    +
      +
    • [profile 1 unique values, shared values, profile 2 unique values]

    • +
    +
  • +
  • Dict - Some dictionaries with varied keys will also return a list +of three in the format:

    +
      +
    • [profile 1 unique key-values, shared key differences, profile 2 unique key-values]

    • +
    +
  • +
+

Otherwise, when no differences occur:

+
    +
  • Any Type No Differences - A string will report: “unchanged”.

  • +
+
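To make these formats concrete, a few hypothetical entries of a difference report (values invented for illustration, and shown flat here rather than in the nested structure below) might look like:

+{
+    'row_count': 10,                           # Int: profile1 value minus profile2 value
+    'file_type': ['csv', 'parquet'],           # String: [profile1 str, profile2 str]
+    'categories': [['a'], ['b', 'c'], ['d']],  # List: [profile1 unique, shared, profile2 unique]
+    'order': 'unchanged'                       # no difference
+}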

Below is the structured difference report:

+
{
+    'global_stats': {
+        'file_type': [str, str],
+        'encoding': [str, str],
+        'samples_used': int,
+        'column_count': int,
+        'row_count': int,
+        'row_has_null_ratio': float,
+        'row_is_null_ratio': float,
+        'unique_row_ratio': float,
+        'duplicate_row_count': int,
+        'correlation_matrix': list[list[float]],
+        'profile_schema': list[dict[str, int]]
+    },
+    'data_stats': [{
+        'column_name': str,
+        'data_type': [str, str],
+        'data_label': [list[str], list[str], list[str]],
+        'categorical': [str, str],
+        'order': [str, str],
+        'statistics': {
+            'min': float,
+            'max': float,
+            'sum': float,
+            'mean': float,
+            'variance': float,
+            'stddev': float,
+            't-test': {
+                't-statistic': float,
+                'conservative': {'df': int,
+                                 'p-value': float},
+                'welch': {'df': float,
+                          'p-value': float}},
+            "chi2-test": {
+                "chi2-statistic": float,
+                "df": int,
+                "p-value": float
+            },
+            'unique_count': int,
+            'unique_ratio': float,
+            'categories': [list[str], list[str], list[str]],
+            'gini_impurity': float,
+            'unalikeability': float,
+            'categorical_count': [dict[str, int], dict[str, int], dict[str, int]],
+            'avg_predictions': [dict[str, float]],
+            'label_representation': [dict[str, float]],
+            'sample_size': int,
+            'null_count': int,
+            'null_types': [list[str], list[str], list[str]],
+            'null_types_index': [dict[str, int], dict[str, int], dict[str, int]],
+            'data_type_representation': [dict[str, float]]
+        }
+    }]
+}
+
+
+

Below is the unstructured difference report:

+
{
+    'global_stats': {
+        'file_type': [str, str],
+        'encoding': [str, str],
+        'samples_used': int,
+        'empty_line_count': int,
+        'memory_size': float
+    },
+    'data_stats': {
+        'data_label': {
+            'entity_counts': {
+                'word_level': dict[str, int],
+                'true_char_level': dict[str, int],
+                'postprocess_char_level': dict[str, int]
+            },
+            'entity_percentages': {
+                'word_level': dict[str, float],
+                'true_char_level': dict[str, float],
+                'postprocess_char_level': dict[str, float]
+            }
+        },
+        'statistics': {
+            'vocab': [list[str], list[str], list[str]],
+            'vocab_count': [dict[str, int], dict[str, int], dict[str, int]],
+            'words': [list[str], list[str], list[str]],
+            'word_count': [dict[str, int], dict[str, int], dict[str, int]]
+        }
+    }
+}
+
+
+
+
+

Saving and Loading a Profile

+

The profiles can easily be saved and loaded as shown below:

+
import json
+from dataprofiler import Data, Profiler
+
+# Load a CSV file, with "," as the delimiter
+data = Data("your_file.csv")
+
+# Read in profile and print results
+profile = Profiler(data)
+profile.save(filepath="my_profile.pkl")
+
+loaded_profile = Profiler.load("my_profile.pkl")
+print(json.dumps(loaded_profile.report(report_options={"output_format": "compact"}),
+                                       indent=4))
+
+
+
+
+

Structured vs Unstructured Profiles

+

When using the profiler, the data profiler will automatically infer whether to +create the structured profile or the unstructured profile. However, you can be +explicit as shown below:

+
import json
+from dataprofiler import Data, Profiler
+
+# Creating a structured profile
+data1 = Data("normal_csv_file.csv")
+structured_profile = Profiler(data1, profiler_type="structured")
+
+structured_report = structured_profile.report(report_options={"output_format": "pretty"})
+print(json.dumps(structured_report, indent=4))
+
+# Creating an unstructured profile
+data2 = Data("normal_text_file.txt")
+unstructured_profile = Profiler(data2, profiler_type="unstructured")
+
+unstructured_report = unstructured_profile.report(report_options={"output_format": "pretty"})
+print(json.dumps(unstructured_report, indent=4))
+
+
+
+
+

Setting the Sample Size

+

There are two ways to set the sample size in a profile: samples_per_update and min_true_samples. samples_per_update takes an integer as the exact number of samples to use. min_true_samples sets the minimum number of samples that are not null. For example:

+
from dataprofiler import Profiler
+
+sample_array = [1.0, None, 2.0]
+profile = Profiler(sample_array, samples_per_update=2)
+
+
+

The first two samples (1.0 and None) are used for the statistical analysis.

+

In contrast, if we also set min_true_samples to 2 then the Data Reader will +continue to read until the minimum true samples were found for the given column. +For example:

+
from dataprofiler import Profiler
+
+sample_array = [1.0, None, 2.0]
+profile = Profiler(sample_array, samples_per_update=2, min_true_samples=2)
+
+
+

This will use all samples in the statistical analysis until the number of “true” (non-null) values is reached. Both the min_true_samples and samples_per_update conditions must be met. In this case, the profile will grab the first two samples (1.0 and None) to satisfy samples_per_update, and then it will grab the first two valid samples (1.0 and 2.0) to satisfy min_true_samples.

+
+
+

Profile a Pandas DataFrame

+
import pandas as pd
+import dataprofiler as dp
+import json
+
+my_dataframe = pd.DataFrame([[1, 2.0],[1, 2.2],[-1, 3]])
+profile = dp.Profiler(my_dataframe)
+
+# print the report using json to prettify.
+report = profile.report(report_options={"output_format": "pretty"})
+print(json.dumps(report, indent=4))
+
+# read a specified column, in this case it is labeled 0:
+print(json.dumps(report["data_stats"][0], indent=4))
+
+
+
+
+

Specifying a Filetype or Delimiter

+

Example of specifying a CSV data type, with a , delimiter. +In addition, it utilizes only the first 10,000 rows.

+
import json
+from dataprofiler import Data, Profiler
+from dataprofiler.data_readers.csv_data import CSVData
+
+# Load a CSV file, with "," as the delimiter
+data = CSVData("your_file.csv", options={"delimiter": ","})
+
+# Split the data, such that only the first 10,000 rows are used
+data = data.data[0:10000]
+
+# Read in profile and print results
+profile = Profiler(data)
+print(json.dumps(profile.report(report_options={"output_format": "pretty"}), indent=4))
+
+
+
+
+

Setting Profiler Seed

+

Example of specifying a seed for reproducibility.

+
import dataprofiler as dp
+
+# Set seed to non-negative integer value or None
+dp.set_seed(0)
+
+
+
+
+
+

Profile Options

+

The data profiler accepts several options to toggle features on and off. The column options (int options, float options, datetime options, text options, order options, category options, and data labeler options) can be enabled or disabled. By default, all options are toggled on. Below is an example of how to alter these options. Options shared by the structured and unstructured options must be specified as structured or unstructured when setting (i.e., data labeler options).

+
import json
+from dataprofiler import Data, Profiler, ProfilerOptions
+
+# Load and profile a CSV file
+data = Data("your_file.csv")
+profile_options = ProfilerOptions()
+
+#All of these are different examples of adjusting the profile options
+
+# Options can be toggled directly like this:
+profile_options.structured_options.text.is_enabled = False
+profile_options.structured_options.text.vocab.is_enabled = True
+profile_options.structured_options.int.variance.is_enabled = True
+profile_options.structured_options.data_labeler.data_labeler_dirpath = \
+    "Wheres/My/Datalabeler"
+profile_options.structured_options.data_labeler.is_enabled = False
+
+# A dictionary can be sent in to set the properties for all the options
+profile_options.set({"structured_options.data_labeler.is_enabled": False, "min.is_enabled": False})
+
+# Specific columns can be set/disabled/enabled in the same way
+profile_options.structured_options.text.set({"max.is_enabled":True,
+                                         "variance.is_enabled": True})
+
+# numeric stats can be turned off/on entirely
+profile_options.set({"is_numeric_stats_enabled": False})
+profile_options.set({"int.is_numeric_stats_enabled": False})
+
+profile = Profiler(data, options=profile_options)
+
+# Print the report using json to prettify.
+report  = profile.report(report_options={"output_format": "pretty"})
+print(json.dumps(report, indent=4))
+
+
+

Below is a breakdown of all the options; a short sketch of setting a few of the unstructured options follows the list.

+
    +
  • ProfilerOptions - The top-level options class that contains options for the Profiler class

    +
      +
    • structured_options - Options responsible for all structured data

      +
        +
      • multiprocess - Option to enable multiprocessing. Automatically selects the optimal number of processes to utilize based on system constraints.

        +
          +
        • is_enabled - (Boolean) Enables or disables multiprocessing

        • +
        +
      • +
      • int - Options for the integer columns

        +
          +
        • is_enabled - (Boolean) Enables or disables the integer operations

        • +
        • min - Finds minimum value in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables min

          • +
          +
        • +
        • max - Finds maximum value in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables max

          • +
          +
        • +
        • sum - Finds sum of all values in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables sum

          • +
          +
        • +
        • variance - Finds variance of all values in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables variance

          • +
          +
        • +
        • skewness - Finds skewness of all values in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables skewness

          • +
          +
        • +
        • kurtosis - Finds kurtosis of all values in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables kurtosis

          • +
          +
        • +
        • num_zeros - Finds the count of zeros in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables num_zeros

          • +
          +
        • +
        • num_negatives - Finds the count of negative numbers in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables num_negatives

          • +
          +
        • +
        • bias_correction - Applies bias correction to variance, skewness, and kurtosis calculations

          +
            +
          • is_enabled - (Boolean) Enables or disables bias correction

          • +
          +
        • +
        • histogram_and_quantiles - Generates a histogram and quantiles +from the column values

          +
            +
          • bin_count_or_method - (String/List[String]) Designates preferred method for calculating histogram bins or the number of bins to use. +If left unspecified (None) the optimal method will be chosen by attempting all methods. +If multiple specified (list) the optimal method will be chosen by attempting the provided ones. +methods: ‘auto’, ‘fd’, ‘doane’, ‘scott’, ‘rice’, ‘sturges’, ‘sqrt’ +Note: ‘auto’ is used to choose optimally between ‘fd’ and ‘sturges’

          • +
          • is_enabled - (Boolean) Enables or disables histogram and quantiles

          • +
          +
        • +
        +
      • +
      • float - Options for the float columns

        +
          +
        • is_enabled - (Boolean) Enables or disables the float operations

        • +
        • precision - Finds the precision (significant figures) within the column

          +
            +
          • is_enabled - (Boolean) Enables or disables precision

          • +
          +
        • +
        • sample_ratio - (Float) The ratio of 0 to 1 how much data (identified as floats) to utilize as samples in determining precision

        • +
        • min - Finds minimum value in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables min

          • +
          +
        • +
        • max - Finds maximum value in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables max

          • +
          +
        • +
        • sum - Finds sum of all values in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables sum

          • +
          +
        • +
        • variance - Finds variance of all values in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables variance

          • +
          +
        • +
        • skewness - Finds skewness of all values in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables skewness

          • +
          +
        • +
        • kurtosis - Finds kurtosis of all values in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables kurtosis

          • +
          +
        • +
        • num_zeros - Finds the count of zeros in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables num_zeros

          • +
          +
        • +
        • num_negatives - Finds the count of negative numbers in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables num_negatives

          • +
          +
        • +
        • bias_correction - Applies bias correction to variance, skewness, and kurtosis calculations

          +
            +
          • is_enabled - (Boolean) Enables or disables bias correction

          • +
          +
        • +
        • histogram_and_quantiles - Generates a histogram and quantiles +from the column values

          +
            +
          • bin_count_or_method - (String/List[String]) Designates preferred method for calculating histogram bins or the number of bins to use. +If left unspecified (None) the optimal method will be chosen by attempting all methods. +If multiple specified (list) the optimal method will be chosen by attempting the provided ones. +methods: ‘auto’, ‘fd’, ‘doane’, ‘scott’, ‘rice’, ‘sturges’, ‘sqrt’ +Note: ‘auto’ is used to choose optimally between ‘fd’ and ‘sturges’

          • +
          • is_enabled - (Boolean) Enables or disables histogram and quantiles

          • +
          +
        • +
        +
      • +
      • text - Options for the text columns

        +
          +
        • is_enabled - (Boolean) Enables or disables the text operations

        • +
        • vocab - Finds all the unique characters used in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables vocab

          • +
          +
        • +
        • min - Finds minimum value in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables min

          • +
          +
        • +
        • max - Finds maximum value in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables max

          • +
          +
        • +
        • sum - Finds sum of all values in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables sum

          • +
          +
        • +
        • variance - Finds variance of all values in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables variance

          • +
          +
        • +
        • skewness - Finds skewness of all values in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables skewness

          • +
          +
        • +
        • kurtosis - Finds kurtosis of all values in a column

          +
            +
          • is_enabled - (Boolean) Enables or disables kurtosis

          • +
          +
        • +
        • bias_correction - Applies bias correction to variance, skewness, and kurtosis calculations

          +
            +
          • is_enabled - (Boolean) Enables or disables bias correction

          • +
          +
        • +
        • histogram_and_quantiles - Generates a histogram and quantiles +from the column values

          +
            +
          • bin_count_or_method - (String/List[String]) Designates preferred method for calculating histogram bins or the number of bins to use. +If left unspecified (None) the optimal method will be chosen by attempting all methods. +If multiple specified (list) the optimal method will be chosen by attempting the provided ones. +methods: ‘auto’, ‘fd’, ‘doane’, ‘scott’, ‘rice’, ‘sturges’, ‘sqrt’ +Note: ‘auto’ is used to choose optimally between ‘fd’ and ‘sturges’

          • +
          • is_enabled - (Boolean) Enables or disables histogram and quantiles

          • +
          +
        • +
        +
      • +
      • datetime - Options for the datetime columns

        +
          +
        • is_enabled - (Boolean) Enables or disables the datetime operations

        • +
        +
      • +
      • order - Options for the order columns

        +
          +
        • is_enabled - (Boolean) Enables or disables the order operations

        • +
        +
      • +
      • category - Options for the category columns

        +
          +
        • is_enabled - (Boolean) Enables or disables the category operations

        • +
        +
      • +
      • data_labeler - Options for the data labeler columns

        +
          +
        • is_enabled - (Boolean) Enables or disables the data labeler operations

        • +
        • data_labeler_dirpath - (String) Directory path to data labeler

        • +
        • data_labeler_object - (BaseDataLabeler) Datalabeler to replace +the default labeler

        • +
        • max_sample_size - (Int) The max number of samples for the data +labeler

        • +
        +
      • +
      +
    • +
    • unstructured_options - Options responsible for all unstructured data

      +
        +
      • text - Options for the text profile

        +
          +
        • is_case_sensitive - (Boolean) Specify whether the profile is case sensitive

        • +
        • stop_words - (List of Strings) List of stop words to be removed when profiling

        • +
        • top_k_chars - (Int) Number of top characters to be retrieved when profiling

        • +
        • top_k_words - (Int) Number of top words to be retrieved when profiling

        • +
        • vocab - Options for vocab count

          +
            +
          • is_enabled - (Boolean) Enables or disables the vocab stats

          • +
          +
        • +
        • words - Options for word count

          +
            +
          • is_enabled - (Boolean) Enables or disables the word stats

          • +
          +
        • +
        +
      • +
      • data_labeler - Options for the data labeler

        +
          +
        • is_enabled - (Boolean) Enables or disables the data labeler operations

        • +
        • data_labeler_dirpath - (String) Directory path to data labeler

        • +
        • data_labeler_object - (BaseDataLabeler) Datalabeler to replace +the default labeler

        • +
        • max_sample_size - (Int) The max number of samples for the data +labeler

        • +
        +
      • +
      +
    • +
    +
  • +
+
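As a minimal sketch (attribute paths taken from the breakdown above; values chosen only for illustration), a few of the unstructured options can be set directly:

+from dataprofiler import ProfilerOptions
+
+profile_options = ProfilerOptions()
+
+# unstructured text options
+profile_options.unstructured_options.text.is_case_sensitive = False
+profile_options.unstructured_options.text.top_k_words = 5
+profile_options.unstructured_options.text.vocab.is_enabled = False
+
+# limit the number of samples sent to the data labeler
+profile_options.unstructured_options.data_labeler.max_sample_size = 100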
+
+

Statistical Dependency on Order of Updates

+

Some profile features/statistics are dependent on the order in which the profiler +is updated with new data.

+
+

Order Profile

+

The order profiler utilizes the last value in the previous data batch to ensure +the subsequent dataset is above/below/equal to that value when predicting +non-random order.

+

For instance, for a dataset to be predicted as ascending, the following batch update would need to be ascending itself, and its first value would need to be greater than or equal to the last value of the previous batch of data.

+

Ex. of ascending:

+
batch_1 = [0, 1, 2]
+batch_2 = [3, 4, 5]
+
+
+

Ex. of random:

+
batch_1 = [0, 1, 2]
+batch_2 = [1, 2, 3] # notice how the first value is less than the last value in the previous batch
+
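A minimal sketch of feeding such batches to the profiler (assuming pandas DataFrame input and the per-column order field shown in the report structures elsewhere in these docs):

+import pandas as pd
+import dataprofiler as dp
+
+profile = dp.Profiler(pd.DataFrame({"col": [0, 1, 2]}))
+profile.update_profile(pd.DataFrame({"col": [1, 2, 3]}))
+
+# the first value of the second batch (1) is less than the last value of the
+# first batch (2), so the combined column should not be predicted as ascending
+report = profile.report(report_options={"output_format": "compact"})
+print(report["data_stats"][0]["order"])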
+
+
+
+
+

Reporting Structure

+

For every profile, we can provide a report and customize it with a couple of optional parameters:

+
    +
  • output_format (string)

    +
      +
    • This allows the user to decide the output format of the report.

      +
        +
      • Options are one of [pretty, compact, serializable, flat]:

        +
          +
        • Pretty: floats are rounded to four decimal places, and lists are shortened.

        • +
        • Compact: Similar to pretty, but removes detailed statistics such as runtimes, label probabilities, index locations of null types, etc.

        • +
        • Serializable: Output is json serializable and not prettified

        • +
        • Flat: Nested output is returned as a flattened dictionary

        • +
        +
      • +
      +
    • +
    +
  • +
  • num_quantile_groups (int)

    +
      +
    • You can sample your data as you like! With a minimum of one and a maximum of 1000, you can decide the number of quantile groups!

    • +
    +
  • +
+
report  = profile.report(report_options={"output_format": "pretty"})
+report  = profile.report(report_options={"output_format": "compact"})
+report  = profile.report(report_options={"output_format": "serializable"})
+report  = profile.report(report_options={"output_format": "flat"})
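+
+# num_quantile_groups (described above) can be passed in the same options
+# dictionary; the value here is only for illustration
+report  = profile.report(report_options={"output_format": "pretty", "num_quantile_groups": 4})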
+
+
+
+
+ +
+ +
+ +
+
+ + + + + + + \ No newline at end of file diff --git a/docs/0.7.1/html/profiler_example.html b/docs/0.7.1/html/profiler_example.html new file mode 100644 index 000000000..72a942428 --- /dev/null +++ b/docs/0.7.1/html/profiler_example.html @@ -0,0 +1,847 @@ + + + + + + + + + Structured Profilers - Data Profiler v0.7.1 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
View this notebook on GitHub


Structured Profilers

Data profiling is the process of examining a dataset and collecting statistical or informational summaries about said dataset.

The Profiler class inside the DataProfiler is designed to generate data profiles by ingesting either a Data class or a Pandas DataFrame.

Currently, the Data class supports loading the following file formats:

  • Any delimited (CSV, TSV, etc.)

  • JSON object

  • Avro

  • Parquet

  • Text files

  • Pandas Series/Dataframe
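
As a quick illustration (the file names below are hypothetical, and dataprofiler is assumed to be imported as dp, as in the cells further down), the Data class detects the format on load:

json_data = dp.Data("my_records.json")        # auto-detected as JSON
parquet_data = dp.Data("my_table.parquet")    # auto-detected as Parquet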

Once the data is loaded, the Profiler can calculate statistics and predict the entities (via the Labeler) of every column (CSV) or key-value (JSON) store, as well as dataset-wide information such as the number of nulls, duplicates, etc.

This example looks specifically at the structured data types for structured profiling.

Reporting

One of the primary purposes of the Profiler is to quickly identify what is in the dataset. This can be useful for analyzing a dataset prior to use or determining which columns could be useful for a given purpose.

In terms of reporting, there are multiple reporting options:

  • Pretty: Floats are rounded to four decimal places, and lists are shortened.

  • Compact: Similar to pretty, but removes detailed statistics such as runtimes, label probabilities, index locations of null types, etc.

  • Serializable: Output is JSON serializable and not prettified.

  • Flat: Nested output is returned as a flattened dictionary.

The Pretty and Compact reports are the two most commonly used reports and include global_stats and data_stats for the given dataset. global_stats contains overall properties of the data such as the number of rows/columns, null ratio, and duplicate ratio. data_stats contains specific properties and statistics for each column, such as min, max, mean, variance, etc.

For structured profiles, the report looks like this:

"global_stats": {
+    "samples_used": int,
+    "column_count": int,
+    "row_count": int,
+    "row_has_null_ratio": float,
+    "row_is_null_ratio": float,
+    "unique_row_ratio": float,
+    "duplicate_row_count": int,
+    "file_type": string,
+    "encoding": string,
+},
+"data_stats": [
+    {
+        "column_name": string,
+        "data_type": string,
+        "data_label": string,
+        "categorical": bool,
+        "order": string,
+        "samples": list(str),
+        "statistics": {
+            "sample_size": int,
+            "null_count": int,
+            "null_types": list(string),
+            "null_types_index": {
+                string: list(int)
+            },
+            "data_type_representation": [string, list(string)],
+            "min": [null, float],
+            "max": [null, float],
+            "mean": float,
+            "variance": float,
+            "stddev": float,
+            "histogram": {
+                "bin_counts": list(int),
+                "bin_edges": list(float),
+            },
+            "quantiles": {
+                int: float
+            }
+            "vocab": list(char),
+            "avg_predictions": dict(float),
+            "data_label_representation": dict(float),
+            "categories": list(str),
+            "unique_count": int,
+            "unique_ratio": float,
+            "precision": {
+                'min': int,
+                'max': int,
+                'mean': float,
+                'var': float,
+                'std': float,
+                'sample_size': int,
+                'margin_of_error': float,
+                'confidence_level': float
+            },
+            "times": dict(float),
+            "format": string
+        }
+    }
+]
+
+
+

In the example below, the compact format of the report is used to shorten the full list of results.

[ ]:

import os
import sys
import json
sys.path.insert(0, '..')
import dataprofiler as dp

data_path = "../dataprofiler/tests/data"

[ ]:

data = dp.Data(os.path.join(data_path, "csv/aws_honeypot_marx_geo.csv"))
profile = dp.Profiler(data)

# Compact - A high level view, good for quick reviews
report = profile.report(report_options={"output_format":"compact"})
print(json.dumps(report, indent=4))

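Since the report is a nested dictionary, individual values can also be read directly; the keys below follow the schema shown earlier, and the column index 0 is illustrative.

print(report["global_stats"]["column_count"])
print(report["data_stats"][0]["column_name"])
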

It should be noted that, in addition to reading input data from multiple file types, the DataProfiler also accepts input data directly as a DataFrame. To get more detailed results, such as entity-level predictions from the DataLabeler component or histogram results, the pretty format should be used.

[ ]:

# run data profiler and get the report
import pandas as pd
my_dataframe = pd.DataFrame([[1, 2.0],[1, 2.2],[-1, 3]], columns=["col_int", "col_float"])
profile = dp.Profiler(my_dataframe)

report = profile.report(report_options={"output_format":"pretty"})
print(json.dumps(report, indent=4))


Profiler Type

The profiler will infer what type of statistics to generate (structured or unstructured) based on the input. However, you can explicitly specify the profile type as well. Here is an example of the profiler explicitly called with the structured profile type.

[ ]:

data = dp.Data(os.path.join(data_path, "csv/aws_honeypot_marx_geo.csv"))
profile = dp.Profiler(data, profiler_type='structured')

# print the report using json to prettify.
report = profile.report(report_options={"output_format": "pretty"})
print(json.dumps(report, indent=4))


Profiler options

The DataProfiler has the ability to turn components on and off as needed. This is accomplished via the ProfilerOptions class.

For example, if a user doesn't require histogram information, they may want to turn off the histogram functionality. Similarly, if a user is looking for more accurate labeling, they can increase the number of samples used for labeling.

Below, let's remove the histogram and increase the number of samples sent to the labeler component (1,000 samples).

A full list of options can be found in the Profiler section of the DataProfiler documentation.

[ ]:

data = dp.Data(os.path.join(data_path, "csv/diamonds.csv"))

profile_options = dp.ProfilerOptions()

# Setting multiple options via set
profile_options.set({ "histogram.is_enabled": False, "int.is_enabled": False})

# Set options by assigning to them directly
profile_options.structured_options.data_labeler.max_sample_size = 1000

profile = dp.Profiler(data, options=profile_options)
report = profile.report(report_options={"output_format":"compact"})

# Print the report
print(json.dumps(report, indent=4))


Updating Profiles

Beyond just profiling, one of the unique aspects of the DataProfiler is the ability to update the profiles. To update appropriately, the schema (columns / keys) must match.

[ ]:

# Load and profile a CSV file
data = dp.Data(os.path.join(data_path, "csv/sparse-first-and-last-column-header-and-author.txt"))
profile = dp.Profiler(data)

# Update the profile with new data:
new_data = dp.Data(os.path.join(data_path, "csv/sparse-first-and-last-column-skip-header.txt"))
# new_data = dp.Data(os.path.join(data_path, "iris-utf-16.csv")) # will error due to schema mismatch
profile.update_profile(new_data)

# Take a peek at the data
print(data.data)
print(new_data.data)

# Report the compact version of the profile
report = profile.report(report_options={"output_format":"compact"})
print(json.dumps(report, indent=4))


Merging Profiles

Merging profiles is an alternative method for updating profiles. In particular, multiple profiles can be generated separately, then added together with a simple + command: profile3 = profile1 + profile2

[ ]:

# Load a CSV file with a schema
data1 = dp.Data(os.path.join(data_path, "csv/sparse-first-and-last-column-header-and-author.txt"))
profile1 = dp.Profiler(data1)

# Load another CSV file with the same schema
data2 = dp.Data(os.path.join(data_path, "csv/sparse-first-and-last-column-skip-header.txt"))
profile2 = dp.Profiler(data2)

# Merge the profiles
profile3 = profile1 + profile2

# Report the compact version of the profile
report = profile3.report(report_options={"output_format":"compact"})
print(json.dumps(report, indent=4))


As you can see, the update_profile function and the + operator function similarly. The reason the + operator is important is that it’s possible to save and load profiles, which we cover next.


Saving and Loading a Profile

Not only can the Profiler create and update profiles, it's also possible to save, load, and then manipulate profiles.

[ ]:

# Load data
data = dp.Data(os.path.join(data_path, "csv/names-col.txt"))

# Generate a profile
profile = dp.Profiler(data)

# Save a profile to disk for later (saves as pickle file)
profile.save(filepath="my_profile.pkl")

# Load a profile from disk
loaded_profile = dp.Profiler.load("my_profile.pkl")

# Report the compact version of the profile
report = profile.report(report_options={"output_format":"compact"})
print(json.dumps(report, indent=4))


With the ability to save and load profiles, profiles can be generated on multiple machines and then merged. Further, profiles can be stored and later used in applications such as change point detection, synthetic data generation, and more.

[ ]:

# Load multiple files via the Data class
filenames = ["csv/sparse-first-and-last-column-header-and-author.txt",
             "csv/sparse-first-and-last-column-skip-header.txt"]
data_objects = []
for filename in filenames:
    data_objects.append(dp.Data(os.path.join(data_path, filename)))


# Generate and save profiles
for i in range(len(data_objects)):
    profile = dp.Profiler(data_objects[i])
    profile.save(filepath="data-"+str(i)+".pkl")


# Load profiles and add them together
profile = None
for i in range(len(data_objects)):
    if profile is None:
        profile = dp.Profiler.load("data-"+str(i)+".pkl")
    else:
        profile += dp.Profiler.load("data-"+str(i)+".pkl")


# Report the compact version of the profile
report = profile.report(report_options={"output_format":"compact"})
print(json.dumps(report, indent=4))
+ + + + + + + + + \ No newline at end of file diff --git a/docs/0.7.1/html/profiler_example.ipynb b/docs/0.7.1/html/profiler_example.ipynb new file mode 100644 index 000000000..cc4ecc218 --- /dev/null +++ b/docs/0.7.1/html/profiler_example.ipynb @@ -0,0 +1,451 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "f37ca393", + "metadata": {}, + "source": [ + "# Structured Profilers" + ] + }, + { + "cell_type": "markdown", + "id": "ff9bd095", + "metadata": {}, + "source": [ + "**Data profiling** - *is the process of examining a dataset and collecting statistical or informational summaries about said dataset.*\n", + "\n", + "The Profiler class inside the DataProfiler is designed to generate *data profiles* via the Profiler class, which ingests either a Data class or a Pandas DataFrame. \n", + "\n", + "Currently, the Data class supports loading the following file formats:\n", + "\n", + "* Any delimited (CSV, TSV, etc.)\n", + "* JSON object\n", + "* Avro\n", + "* Parquet\n", + "* Text files\n", + "* Pandas Series/Dataframe\n", + "\n", + "Once the data is loaded, the Profiler can calculate statistics and predict the entities (via the Labeler) of every column (csv) or key-value (JSON) store as well as dataset wide information, such as the number of nulls, duplicates, etc.\n", + "\n", + "This example will look at specifically the structured data types for structured profiling. " + ] + }, + { + "cell_type": "markdown", + "id": "de58b9c4", + "metadata": {}, + "source": [ + "## Reporting" + ] + }, + { + "cell_type": "markdown", + "id": "8001185a", + "metadata": {}, + "source": [ + "One of the primary purposes of the Profiler are to quickly identify what is in the dataset. This can be useful for analyzing a dataset prior to use or determining which columns could be useful for a given purpose.\n", + "\n", + "In terms of reporting, there are multiple reporting options:\n", + "\n", + "* **Pretty**: Floats are rounded to four decimal places, and lists are shortened.\n", + "* **Compact**: Similar to pretty, but removes detailed statistics such as runtimes, label probabilities, index locations of null types, etc.\n", + "* **Serializable**: Output is json serializable and not prettified\n", + "* **Flat**: Nested Output is returned as a flattened dictionary\n", + "\n", + "The **Pretty** and **Compact** reports are the two most commonly used reports and includes `global_stats` and `data_stats` for the given dataset. `global_stats` contains overall properties of the data such as number of rows/columns, null ratio, duplicate ratio. 
`data_stats` contains specific properties and statistics for each column file such as min, max, mean, variance, etc.\n", + "\n", + "For structured profiles, the report looks like this:\n", + "\n", + "```\n", + "\"global_stats\": {\n", + " \"samples_used\": int,\n", + " \"column_count\": int,\n", + " \"row_count\": int,\n", + " \"row_has_null_ratio\": float,\n", + " \"row_is_null_ratio\": float, \n", + " \"unique_row_ratio\": float,\n", + " \"duplicate_row_count\": int,\n", + " \"file_type\": string,\n", + " \"encoding\": string,\n", + "},\n", + "\"data_stats\": [\n", + " {\n", + " \"column_name\": string,\n", + " \"data_type\": string,\n", + " \"data_label\": string,\n", + " \"categorical\": bool,\n", + " \"order\": string,\n", + " \"samples\": list(str),\n", + " \"statistics\": {\n", + " \"sample_size\": int,\n", + " \"null_count\": int,\n", + " \"null_types\": list(string),\n", + " \"null_types_index\": {\n", + " string: list(int)\n", + " },\n", + " \"data_type_representation\": [string, list(string)],\n", + " \"min\": [null, float],\n", + " \"max\": [null, float],\n", + " \"mean\": float,\n", + " \"variance\": float,\n", + " \"stddev\": float,\n", + " \"histogram\": { \n", + " \"bin_counts\": list(int),\n", + " \"bin_edges\": list(float),\n", + " },\n", + " \"quantiles\": {\n", + " int: float\n", + " }\n", + " \"vocab\": list(char),\n", + " \"avg_predictions\": dict(float), \n", + " \"data_label_representation\": dict(float),\n", + " \"categories\": list(str),\n", + " \"unique_count\": int,\n", + " \"unique_ratio\": float,\n", + " \"precision\": {\n", + " 'min': int,\n", + " 'max': int,\n", + " 'mean': float,\n", + " 'var': float,\n", + " 'std': float,\n", + " 'sample_size': int,\n", + " 'margin_of_error': float,\n", + " 'confidence_level': float\t\t\n", + " },\n", + " \"times\": dict(float),\n", + " \"format\": string\n", + " }\n", + " }\n", + "]\n", + "```\n", + "\n", + "In the example, the `compact` format of the report is used to shorten the full list of the results. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5fcb5447", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import json\n", + "sys.path.insert(0, '..')\n", + "import dataprofiler as dp\n", + "\n", + "data_path = \"../dataprofiler/tests/data\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a7fc2df6", + "metadata": {}, + "outputs": [], + "source": [ + "data = dp.Data(os.path.join(data_path, \"csv/aws_honeypot_marx_geo.csv\"))\n", + "profile = dp.Profiler(data)\n", + "\n", + "# Compact - A high level view, good for quick reviews\n", + "report = profile.report(report_options={\"output_format\":\"compact\"})\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "d7ec39d2", + "metadata": {}, + "source": [ + "It should be noted, in addition to reading the input data from multiple file types, DataProfiler allows the input data as a dataframe. To get more results related to detailed predictions at the entity level from the DataLabeler component or histogram results, the format `pretty` should be used. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "29737f25", + "metadata": {}, + "outputs": [], + "source": [ + "# run data profiler and get the report\n", + "import pandas as pd\n", + "my_dataframe = pd.DataFrame([[1, 2.0],[1, 2.2],[-1, 3]], columns=[\"col_int\", \"col_float\"])\n", + "profile = dp.Profiler(my_dataframe)\n", + "\n", + "report = profile.report(report_options={\"output_format\":\"pretty\"})\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "241f6e3e", + "metadata": {}, + "source": [ + "# Profiler Type" + ] + }, + { + "cell_type": "markdown", + "id": "5b20879b", + "metadata": {}, + "source": [ + "The profiler will infer what type of statistics to generate (structured or unstructured) based on the input. However, you can explicitly specify profile type as well. Here is an example of the the profiler explicitly calling the structured profile." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dc44eb47", + "metadata": {}, + "outputs": [], + "source": [ + "data = dp.Data(os.path.join(data_path, \"csv/aws_honeypot_marx_geo.csv\"))\n", + "profile = dp.Profiler(data, profiler_type='structured')\n", + "\n", + "# print the report using json to prettify.\n", + "report = profile.report(report_options={\"output_format\": \"pretty\"})\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "fe02ad64", + "metadata": {}, + "source": [ + "## Profiler options" + ] + }, + { + "cell_type": "markdown", + "id": "40804cc9", + "metadata": {}, + "source": [ + "The DataProfiler has the ability to turn on and off components as needed. This is accomplished via the `ProfilerOptions` class.\n", + "\n", + "For example, if a user doesn't require histogram information they may desire to turn off the histogram functionality. Simialrly, if a user is looking for a more accurate labeling, they can increase the samples used to label.\n", + "\n", + "Below, let's remove the histogram and increase the number of samples to the labeler component (1,000 samples). \n", + "\n", + "Full list of options in the Profiler section of the [DataProfiler documentation](https://capitalone.github.io/DataProfiler)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d25d899", + "metadata": {}, + "outputs": [], + "source": [ + "data = dp.Data(os.path.join(data_path, \"csv/diamonds.csv\"))\n", + "\n", + "profile_options = dp.ProfilerOptions()\n", + "\n", + "# Setting multiple options via set\n", + "profile_options.set({ \"histogram.is_enabled\": False, \"int.is_enabled\": False})\n", + "\n", + "# Set options via directly setting them\n", + "profile_options.structured_options.data_labeler.max_sample_size = 1000\n", + "\n", + "profile = dp.Profiler(data, options=profile_options)\n", + "report = profile.report(report_options={\"output_format\":\"compact\"})\n", + "\n", + "# Print the report\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "2052415a", + "metadata": {}, + "source": [ + "## Updating Profiles" + ] + }, + { + "cell_type": "markdown", + "id": "7e02f746", + "metadata": {}, + "source": [ + "Beyond just profiling, one of the unique aspects of the DataProfiler is the ability to update the profiles. To update appropriately, the schema (columns / keys) must match appropriately." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ab8022f", + "metadata": {}, + "outputs": [], + "source": [ + "# Load and profile a CSV file\n", + "data = dp.Data(os.path.join(data_path, \"csv/sparse-first-and-last-column-header-and-author.txt\"))\n", + "profile = dp.Profiler(data)\n", + "\n", + "# Update the profile with new data:\n", + "new_data = dp.Data(os.path.join(data_path, \"csv/sparse-first-and-last-column-skip-header.txt\"))\n", + "# new_data = dp.Data(os.path.join(data_path, \"iris-utf-16.csv\")) # will error due to schema mismatch\n", + "profile.update_profile(new_data)\n", + "\n", + "# Take a peek at the data\n", + "print(data.data)\n", + "print(new_data.data)\n", + "\n", + "# Report the compact version of the profile\n", + "report = profile.report(report_options={\"output_format\":\"compact\"})\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "66ec6dc5", + "metadata": {}, + "source": [ + "## Merging Profiles" + ] + }, + { + "cell_type": "markdown", + "id": "e2265fe9", + "metadata": {}, + "source": [ + "Merging profiles are an alternative method for updating profiles. Particularly, multiple profiles can be generated seperately, then added together with a simple `+` command: `profile3 = profile1 + profile2`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cc68ca07", + "metadata": {}, + "outputs": [], + "source": [ + "# Load a CSV file with a schema\n", + "data1 = dp.Data(os.path.join(data_path, \"csv/sparse-first-and-last-column-header-and-author.txt\"))\n", + "profile1 = dp.Profiler(data1)\n", + "\n", + "# Load another CSV file with the same schema\n", + "data2 = dp.Data(os.path.join(data_path, \"csv/sparse-first-and-last-column-skip-header.txt\"))\n", + "profile2 = dp.Profiler(data2)\n", + "\n", + "# Merge the profiles\n", + "profile3 = profile1 + profile2\n", + "\n", + "# Report the compact version of the profile\n", + "report = profile3.report(report_options={\"output_format\":\"compact\"})\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "7ea07dc6", + "metadata": {}, + "source": [ + "As you can see, the `update_profile` function and the `+` operator function similarly. The reason the `+` operator is important is that it's possible to *save and load profiles*, which we cover next." + ] + }, + { + "cell_type": "markdown", + "id": "30868000", + "metadata": {}, + "source": [ + "## Saving and Loading a Profile" + ] + }, + { + "cell_type": "markdown", + "id": "f2858072", + "metadata": {}, + "source": [ + "Not only can the Profiler create and update profiles, it's also possible to save, load then manipulate profiles." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2ad9ca57", + "metadata": {}, + "outputs": [], + "source": [ + "# Load data\n", + "data = dp.Data(os.path.join(data_path, \"csv/names-col.txt\"))\n", + "\n", + "# Generate a profile\n", + "profile = dp.Profiler(data)\n", + "\n", + "# Save a profile to disk for later (saves as pickle file)\n", + "profile.save(filepath=\"my_profile.pkl\")\n", + "\n", + "# Load a profile from disk\n", + "loaded_profile = dp.Profiler.load(\"my_profile.pkl\")\n", + "\n", + "# Report the compact version of the profile\n", + "report = profile.report(report_options={\"output_format\":\"compact\"})\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "8f9859c2", + "metadata": {}, + "source": [ + "With the ability to save and load profiles, profiles can be generated via multiple machines then merged. Further, profiles can be stored and later used in applications such as change point detection, synthetic data generation, and more. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3571f2d0", + "metadata": {}, + "outputs": [], + "source": [ + "# Load a multiple files via the Data class\n", + "filenames = [\"csv/sparse-first-and-last-column-header-and-author.txt\",\n", + " \"csv/sparse-first-and-last-column-skip-header.txt\"]\n", + "data_objects = []\n", + "for filename in filenames:\n", + " data_objects.append(dp.Data(os.path.join(data_path, filename)))\n", + "\n", + "\n", + "# Generate and save profiles\n", + "for i in range(len(data_objects)):\n", + " profile = dp.Profiler(data_objects[i])\n", + " profile.save(filepath=\"data-\"+str(i)+\".pkl\")\n", + "\n", + "\n", + "# Load profiles and add them together\n", + "profile = None\n", + "for i in range(len(data_objects)):\n", + " if profile is None:\n", + " profile = dp.Profiler.load(\"data-\"+str(i)+\".pkl\")\n", + " else:\n", + " profile += dp.Profiler.load(\"data-\"+str(i)+\".pkl\")\n", + "\n", + "\n", + "# Report the compact version of the profile\n", + "report = profile.report(report_options={\"output_format\":\"compact\"})\n", + "print(json.dumps(report, indent=4))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/0.7.1/html/py-modindex.html b/docs/0.7.1/html/py-modindex.html new file mode 100644 index 000000000..a29fe3619 --- /dev/null +++ b/docs/0.7.1/html/py-modindex.html @@ -0,0 +1,539 @@ + + + + + + + Python Module Index - Data Profiler v0.7.1 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +

Python Module Index

d
  dataprofiler
    dataprofiler.data_readers
    dataprofiler.data_readers.avro_data
    dataprofiler.data_readers.base_data
    dataprofiler.data_readers.csv_data
    dataprofiler.data_readers.data
    dataprofiler.data_readers.data_utils
    dataprofiler.data_readers.filepath_or_buffer
    dataprofiler.data_readers.json_data
    dataprofiler.data_readers.parquet_data
    dataprofiler.data_readers.structured_mixins
    dataprofiler.data_readers.text_data
    dataprofiler.dp_logging
    dataprofiler.labelers
    dataprofiler.labelers.base_data_labeler
    dataprofiler.labelers.base_model
    dataprofiler.labelers.character_level_cnn_model
    dataprofiler.labelers.classification_report_utils
    dataprofiler.labelers.data_labelers
    dataprofiler.labelers.data_processing
    dataprofiler.labelers.labeler_utils
    dataprofiler.labelers.regex_model
    dataprofiler.profilers
    dataprofiler.profilers.base_column_profilers
    dataprofiler.profilers.categorical_column_profile
    dataprofiler.profilers.column_profile_compilers
    dataprofiler.profilers.data_labeler_column_profile
    dataprofiler.profilers.datetime_column_profile
    dataprofiler.profilers.float_column_profile
    dataprofiler.profilers.helpers
    dataprofiler.profilers.helpers.report_helpers
    dataprofiler.profilers.histogram_utils
    dataprofiler.profilers.int_column_profile
    dataprofiler.profilers.numerical_column_stats
    dataprofiler.profilers.order_column_profile
    dataprofiler.profilers.profile_builder
    dataprofiler.profilers.profiler_options
    dataprofiler.profilers.text_column_profile
    dataprofiler.profilers.unstructured_labeler_profile
    dataprofiler.profilers.unstructured_text_profile
    dataprofiler.profilers.utils
    dataprofiler.reports
    dataprofiler.reports.graphs
    dataprofiler.settings
    dataprofiler.validators
    dataprofiler.validators.base_validators
    dataprofiler.version
\ No newline at end of file
diff --git a/docs/0.7.1/html/search.html b/docs/0.7.1/html/search.html
new file mode 100644
index 000000000..f0b92d985
--- /dev/null
+++ b/docs/0.7.1/html/search.html
@@ -0,0 +1,255 @@
Search - Data Profiler v0.7.1

Error

Please activate JavaScript to enable the search functionality.

+ + + + + + + + + + \ No newline at end of file diff --git a/docs/0.7.1/html/searchindex.js b/docs/0.7.1/html/searchindex.js new file mode 100644 index 000000000..04ec4e6fb --- /dev/null +++ b/docs/0.7.1/html/searchindex.js @@ -0,0 +1 @@ +Search.setIndex({docnames:["API","add_new_model_to_data_labeler","data_labeling","data_reader","data_readers","dataprofiler","dataprofiler.data_readers","dataprofiler.data_readers.avro_data","dataprofiler.data_readers.base_data","dataprofiler.data_readers.csv_data","dataprofiler.data_readers.data","dataprofiler.data_readers.data_utils","dataprofiler.data_readers.filepath_or_buffer","dataprofiler.data_readers.json_data","dataprofiler.data_readers.parquet_data","dataprofiler.data_readers.structured_mixins","dataprofiler.data_readers.text_data","dataprofiler.dp_logging","dataprofiler.labelers","dataprofiler.labelers.base_data_labeler","dataprofiler.labelers.base_model","dataprofiler.labelers.character_level_cnn_model","dataprofiler.labelers.classification_report_utils","dataprofiler.labelers.data_labelers","dataprofiler.labelers.data_processing","dataprofiler.labelers.labeler_utils","dataprofiler.labelers.regex_model","dataprofiler.profilers","dataprofiler.profilers.base_column_profilers","dataprofiler.profilers.categorical_column_profile","dataprofiler.profilers.column_profile_compilers","dataprofiler.profilers.data_labeler_column_profile","dataprofiler.profilers.datetime_column_profile","dataprofiler.profilers.float_column_profile","dataprofiler.profilers.helpers","dataprofiler.profilers.helpers.report_helpers","dataprofiler.profilers.histogram_utils","dataprofiler.profilers.int_column_profile","dataprofiler.profilers.numerical_column_stats","dataprofiler.profilers.order_column_profile","dataprofiler.profilers.profile_builder","dataprofiler.profilers.profiler_options","dataprofiler.profilers.text_column_profile","dataprofiler.profilers.unstructured_data_labeler_column_profile","dataprofiler.profilers.unstructured_labeler_profile","dataprofiler.profilers.unstructured_text_profile","dataprofiler.profilers.utils","dataprofiler.reports","dataprofiler.reports.graphs","dataprofiler.settings","dataprofiler.validators","dataprofiler.validators.base_validators","dataprofiler.version","examples","graphs","index","install","labeler","modules","overview","profiler","profiler_example","unstructured_profiler_example"],envversion:{"sphinx.domains.c":2,"sphinx.domains.changeset":1,"sphinx.domains.citation":1,"sphinx.domains.cpp":4,"sphinx.domains.index":1,"sphinx.domains.javascript":2,"sphinx.domains.math":2,"sphinx.domains.python":3,"sphinx.domains.rst":2,"sphinx.domains.std":2,"sphinx.ext.intersphinx":1,nbsphinx:3,sphinx:56},filenames:["API.rst","add_new_model_to_data_labeler.nblink","data_labeling.rst","data_reader.nblink","data_readers.rst","dataprofiler.rst","dataprofiler.data_readers.rst","dataprofiler.data_readers.avro_data.rst","dataprofiler.data_readers.base_data.rst","dataprofiler.data_readers.csv_data.rst","dataprofiler.data_readers.data.rst","dataprofiler.data_readers.data_utils.rst","dataprofiler.data_readers.filepath_or_buffer.rst","dataprofiler.data_readers.json_data.rst","dataprofiler.data_readers.parquet_data.rst","dataprofiler.data_readers.structured_mixins.rst","dataprofiler.data_readers.text_data.rst","dataprofiler.dp_logging.rst","dataprofiler.labelers.rst","dataprofiler.labelers.base_data_labeler.rst","dataprofiler.labelers.base_model.rst","dataprofiler.labelers.character_level_cnn_model.rst","dataprofiler.labelers.classification_report_utils.rst","da
taprofiler.labelers.data_labelers.rst","dataprofiler.labelers.data_processing.rst","dataprofiler.labelers.labeler_utils.rst","dataprofiler.labelers.regex_model.rst","dataprofiler.profilers.rst","dataprofiler.profilers.base_column_profilers.rst","dataprofiler.profilers.categorical_column_profile.rst","dataprofiler.profilers.column_profile_compilers.rst","dataprofiler.profilers.data_labeler_column_profile.rst","dataprofiler.profilers.datetime_column_profile.rst","dataprofiler.profilers.float_column_profile.rst","dataprofiler.profilers.helpers.rst","dataprofiler.profilers.helpers.report_helpers.rst","dataprofiler.profilers.histogram_utils.rst","dataprofiler.profilers.int_column_profile.rst","dataprofiler.profilers.numerical_column_stats.rst","dataprofiler.profilers.order_column_profile.rst","dataprofiler.profilers.profile_builder.rst","dataprofiler.profilers.profiler_options.rst","dataprofiler.profilers.text_column_profile.rst","dataprofiler.profilers.unstructured_data_labeler_column_profile.rst","dataprofiler.profilers.unstructured_labeler_profile.rst","dataprofiler.profilers.unstructured_text_profile.rst","dataprofiler.profilers.utils.rst","dataprofiler.reports.rst","dataprofiler.reports.graphs.rst","dataprofiler.settings.rst","dataprofiler.validators.rst","dataprofiler.validators.base_validators.rst","dataprofiler.version.rst","examples.rst","graphs.rst","index.rst","install.rst","labeler.nblink","modules.rst","overview.nblink","profiler.rst","profiler_example.nblink","unstructured_profiler_example.nblink"],objects:{"":{dataprofiler:[5,0,0,"-"]},"dataprofiler.data_readers":{avro_data:[7,0,0,"-"],base_data:[8,0,0,"-"],csv_data:[9,0,0,"-"],data:[10,0,0,"-"],data_utils:[11,0,0,"-"],filepath_or_buffer:[12,0,0,"-"],json_data:[13,0,0,"-"],parquet_data:[14,0,0,"-"],structured_mixins:[15,0,0,"-"],text_data:[16,0,0,"-"]},"dataprofiler.data_readers.avro_data":{AVROData:[7,1,1,""]},"dataprofiler.data_readers.avro_data.AVROData":{data:[7,2,1,""],data_and_metadata:[7,2,1,""],data_format:[7,2,1,""],data_type:[7,3,1,""],file_encoding:[7,2,1,""],get_batch_generator:[7,4,1,""],info:[7,3,1,""],is_match:[7,4,1,""],is_structured:[7,2,1,""],length:[7,2,1,""],metadata:[7,2,1,""],reload:[7,4,1,""],selected_keys:[7,2,1,""]},"dataprofiler.data_readers.base_data":{BaseData:[8,1,1,""]},"dataprofiler.data_readers.base_data.BaseData":{data:[8,2,1,""],data_format:[8,2,1,""],data_type:[8,3,1,""],file_encoding:[8,2,1,""],get_batch_generator:[8,4,1,""],info:[8,3,1,""],is_match:[8,4,1,""],is_structured:[8,2,1,""],length:[8,2,1,""],reload:[8,4,1,""]},"dataprofiler.data_readers.csv_data":{CSVData:[9,1,1,""]},"dataprofiler.data_readers.csv_data.CSVData":{data:[9,2,1,""],data_format:[9,2,1,""],data_type:[9,3,1,""],delimiter:[9,2,1,""],file_encoding:[9,2,1,""],get_batch_generator:[9,4,1,""],header:[9,2,1,""],info:[9,3,1,""],is_match:[9,4,1,""],is_structured:[9,2,1,""],length:[9,2,1,""],quotechar:[9,2,1,""],reload:[9,4,1,""],selected_columns:[9,2,1,""]},"dataprofiler.data_readers.data":{Data:[10,1,1,""]},"dataprofiler.data_readers.data.Data":{data_classes:[10,3,1,""]},"dataprofiler.data_readers.data_utils":{convert_int_to_string:[11,5,1,""],data_generator:[11,5,1,""],detect_cell_type:[11,5,1,""],detect_file_encoding:[11,5,1,""],find_nth_loc:[11,5,1,""],generator_on_file:[11,5,1,""],get_delimiter_regex:[11,5,1,""],is_stream_buffer:[11,5,1,""],is_valid_url:[11,5,1,""],json_to_dataframe:[11,5,1,""],load_as_str_from_file:[11,5,1,""],read_csv_df:[11,5,1,""],read_json:[11,5,1,""],read_json_df:[11,5,1,""],read_parquet_df:[11,5,1,""],rea
d_text_as_list_of_strs:[11,5,1,""],unicode_to_str:[11,5,1,""],url_to_bytes:[11,5,1,""]},"dataprofiler.data_readers.filepath_or_buffer":{FileOrBufferHandler:[12,1,1,""]},"dataprofiler.data_readers.json_data":{JSONData:[13,1,1,""]},"dataprofiler.data_readers.json_data.JSONData":{data:[13,2,1,""],data_and_metadata:[13,2,1,""],data_format:[13,2,1,""],data_type:[13,3,1,""],file_encoding:[13,2,1,""],get_batch_generator:[13,4,1,""],info:[13,3,1,""],is_match:[13,4,1,""],is_structured:[13,2,1,""],length:[13,2,1,""],metadata:[13,2,1,""],reload:[13,4,1,""],selected_keys:[13,2,1,""]},"dataprofiler.data_readers.parquet_data":{ParquetData:[14,1,1,""]},"dataprofiler.data_readers.parquet_data.ParquetData":{data:[14,2,1,""],data_format:[14,2,1,""],data_type:[14,3,1,""],file_encoding:[14,2,1,""],get_batch_generator:[14,4,1,""],info:[14,3,1,""],is_match:[14,4,1,""],is_structured:[14,2,1,""],length:[14,2,1,""],reload:[14,4,1,""],selected_columns:[14,2,1,""]},"dataprofiler.data_readers.structured_mixins":{SpreadSheetDataMixin:[15,1,1,""]},"dataprofiler.data_readers.text_data":{TextData:[16,1,1,""]},"dataprofiler.data_readers.text_data.TextData":{data:[16,2,1,""],data_format:[16,2,1,""],data_type:[16,3,1,""],file_encoding:[16,2,1,""],get_batch_generator:[16,4,1,""],info:[16,3,1,""],is_match:[16,4,1,""],is_structured:[16,2,1,""],length:[16,2,1,""],reload:[16,4,1,""],samples_per_line:[16,2,1,""],tokenize:[16,4,1,""]},"dataprofiler.dp_logging":{get_child_logger:[17,5,1,""],get_logger:[17,5,1,""],set_verbosity:[17,5,1,""]},"dataprofiler.labelers":{base_data_labeler:[19,0,0,"-"],base_model:[20,0,0,"-"],character_level_cnn_model:[21,0,0,"-"],classification_report_utils:[22,0,0,"-"],data_labelers:[23,0,0,"-"],data_processing:[24,0,0,"-"],labeler_utils:[25,0,0,"-"],regex_model:[26,0,0,"-"]},"dataprofiler.labelers.base_data_labeler":{BaseDataLabeler:[19,1,1,""],TrainableDataLabeler:[19,1,1,""]},"dataprofiler.labelers.base_data_labeler.BaseDataLabeler":{add_label:[19,4,1,""],check_pipeline:[19,4,1,""],help:[19,4,1,""],label_mapping:[19,2,1,""],labels:[19,2,1,""],load_from_disk:[19,4,1,""],load_from_library:[19,4,1,""],load_with_components:[19,4,1,""],model:[19,2,1,""],postprocessor:[19,2,1,""],predict:[19,4,1,""],preprocessor:[19,2,1,""],reverse_label_mapping:[19,2,1,""],save_to_disk:[19,4,1,""],set_labels:[19,4,1,""],set_model:[19,4,1,""],set_params:[19,4,1,""],set_postprocessor:[19,4,1,""],set_preprocessor:[19,4,1,""]},"dataprofiler.labelers.base_data_labeler.TrainableDataLabeler":{add_label:[19,4,1,""],check_pipeline:[19,4,1,""],fit:[19,4,1,""],help:[19,4,1,""],label_mapping:[19,2,1,""],labels:[19,2,1,""],load_from_disk:[19,4,1,""],load_from_library:[19,4,1,""],load_with_components:[19,4,1,""],model:[19,2,1,""],postprocessor:[19,2,1,""],predict:[19,4,1,""],preprocessor:[19,2,1,""],reverse_label_mapping:[19,2,1,""],save_to_disk:[19,4,1,""],set_labels:[19,4,1,""],set_model:[19,4,1,""],set_params:[19,4,1,""],set_postprocessor:[19,4,1,""],set_preprocessor:[19,4,1,""]},"dataprofiler.labelers.base_model":{AutoSubRegistrationMeta:[20,1,1,""],BaseModel:[20,1,1,""],BaseTrainableModel:[20,1,1,""]},"dataprofiler.labelers.base_model.AutoSubRegistrationMeta":{mro:[20,4,1,""],register:[20,4,1,""]},"dataprofiler.labelers.base_model.BaseModel":{add_label:[20,4,1,""],get_class:[20,4,1,""],get_parameters:[20,4,1,""],help:[20,4,1,""],label_mapping:[20,2,1,""],labels:[20,2,1,""],load_from_disk:[20,4,1,""],num_labels:[20,2,1,""],predict:[20,4,1,""],requires_zero_mapping:[20,3,1,""],reset_weights:[20,4,1,""],reverse_label_mapping:[20,2,1,""
],save_to_disk:[20,4,1,""],set_label_mapping:[20,4,1,""],set_params:[20,4,1,""]},"dataprofiler.labelers.base_model.BaseTrainableModel":{add_label:[20,4,1,""],fit:[20,4,1,""],get_class:[20,4,1,""],get_parameters:[20,4,1,""],help:[20,4,1,""],label_mapping:[20,2,1,""],labels:[20,2,1,""],load_from_disk:[20,4,1,""],num_labels:[20,2,1,""],predict:[20,4,1,""],requires_zero_mapping:[20,3,1,""],reset_weights:[20,4,1,""],reverse_label_mapping:[20,2,1,""],save_to_disk:[20,4,1,""],set_label_mapping:[20,4,1,""],set_params:[20,4,1,""]},"dataprofiler.labelers.character_level_cnn_model":{CharacterLevelCnnModel:[21,1,1,""],F1Score:[21,1,1,""],FBetaScore:[21,1,1,""],NoV1ResourceMessageFilter:[21,1,1,""],build_embd_dictionary:[21,5,1,""],create_glove_char:[21,5,1,""]},"dataprofiler.labelers.character_level_cnn_model.CharacterLevelCnnModel":{add_label:[21,4,1,""],details:[21,4,1,""],fit:[21,4,1,""],get_class:[21,4,1,""],get_parameters:[21,4,1,""],help:[21,4,1,""],label_mapping:[21,2,1,""],labels:[21,2,1,""],load_from_disk:[21,4,1,""],num_labels:[21,2,1,""],predict:[21,4,1,""],requires_zero_mapping:[21,3,1,""],reset_weights:[21,4,1,""],reverse_label_mapping:[21,2,1,""],save_to_disk:[21,4,1,""],set_label_mapping:[21,4,1,""],set_params:[21,4,1,""]},"dataprofiler.labelers.character_level_cnn_model.F1Score":{activity_regularizer:[21,2,1,""],add_loss:[21,4,1,""],add_metric:[21,4,1,""],add_update:[21,4,1,""],add_variable:[21,4,1,""],add_weight:[21,4,1,""],apply:[21,4,1,""],build:[21,4,1,""],call:[21,4,1,""],compute_dtype:[21,2,1,""],compute_mask:[21,4,1,""],compute_output_shape:[21,4,1,""],compute_output_signature:[21,4,1,""],count_params:[21,4,1,""],dtype:[21,2,1,""],dtype_policy:[21,2,1,""],dynamic:[21,2,1,""],from_config:[21,4,1,""],get_config:[21,4,1,""],get_input_at:[21,4,1,""],get_input_mask_at:[21,4,1,""],get_input_shape_at:[21,4,1,""],get_losses_for:[21,4,1,""],get_output_at:[21,4,1,""],get_output_mask_at:[21,4,1,""],get_output_shape_at:[21,4,1,""],get_updates_for:[21,4,1,""],get_weights:[21,4,1,""],inbound_nodes:[21,2,1,""],input:[21,2,1,""],input_mask:[21,2,1,""],input_shape:[21,2,1,""],input_spec:[21,2,1,""],losses:[21,2,1,""],metrics:[21,2,1,""],name:[21,2,1,""],name_scope:[21,2,1,""],non_trainable_variables:[21,2,1,""],non_trainable_weights:[21,2,1,""],outbound_nodes:[21,2,1,""],output:[21,2,1,""],output_mask:[21,2,1,""],output_shape:[21,2,1,""],reset_state:[21,4,1,""],reset_states:[21,4,1,""],result:[21,4,1,""],set_weights:[21,4,1,""],stateful:[21,2,1,""],submodules:[21,2,1,""],supports_masking:[21,2,1,""],trainable:[21,2,1,""],trainable_variables:[21,2,1,""],trainable_weights:[21,2,1,""],update_state:[21,4,1,""],updates:[21,2,1,""],variable_dtype:[21,2,1,""],variables:[21,2,1,""],weights:[21,2,1,""],with_name_scope:[21,4,1,""]},"dataprofiler.labelers.character_level_cnn_model.FBetaScore":{activity_regularizer:[21,2,1,""],add_loss:[21,4,1,""],add_metric:[21,4,1,""],add_update:[21,4,1,""],add_variable:[21,4,1,""],add_weight:[21,4,1,""],apply:[21,4,1,""],build:[21,4,1,""],call:[21,4,1,""],compute_dtype:[21,2,1,""],compute_mask:[21,4,1,""],compute_output_shape:[21,4,1,""],compute_output_signature:[21,4,1,""],count_params:[21,4,1,""],dtype:[21,2,1,""],dtype_policy:[21,2,1,""],dynamic:[21,2,1,""],from_config:[21,4,1,""],get_config:[21,4,1,""],get_input_at:[21,4,1,""],get_input_mask_at:[21,4,1,""],get_input_shape_at:[21,4,1,""],get_losses_for:[21,4,1,""],get_output_at:[21,4,1,""],get_output_mask_at:[21,4,1,""],get_output_shape_at:[21,4,1,""],get_updates_for:[21,4,1,""],get_weights:[21,4,1,""],inbound_nodes:[2
1,2,1,""],input:[21,2,1,""],input_mask:[21,2,1,""],input_shape:[21,2,1,""],input_spec:[21,2,1,""],losses:[21,2,1,""],metrics:[21,2,1,""],name:[21,2,1,""],name_scope:[21,2,1,""],non_trainable_variables:[21,2,1,""],non_trainable_weights:[21,2,1,""],outbound_nodes:[21,2,1,""],output:[21,2,1,""],output_mask:[21,2,1,""],output_shape:[21,2,1,""],reset_state:[21,4,1,""],reset_states:[21,4,1,""],result:[21,4,1,""],set_weights:[21,4,1,""],stateful:[21,2,1,""],submodules:[21,2,1,""],supports_masking:[21,2,1,""],trainable:[21,2,1,""],trainable_variables:[21,2,1,""],trainable_weights:[21,2,1,""],update_state:[21,4,1,""],updates:[21,2,1,""],variable_dtype:[21,2,1,""],variables:[21,2,1,""],weights:[21,2,1,""],with_name_scope:[21,4,1,""]},"dataprofiler.labelers.character_level_cnn_model.NoV1ResourceMessageFilter":{filter:[21,4,1,""]},"dataprofiler.labelers.classification_report_utils":{classification_report:[22,5,1,""],convert_confusion_matrix_to_MCM:[22,5,1,""],precision_recall_fscore_support:[22,5,1,""]},"dataprofiler.labelers.data_labelers":{DataLabeler:[23,1,1,""],StructuredDataLabeler:[23,1,1,""],UnstructuredDataLabeler:[23,1,1,""],train_structured_labeler:[23,5,1,""]},"dataprofiler.labelers.data_labelers.DataLabeler":{labeler_classes:[23,3,1,""]},"dataprofiler.labelers.data_labelers.StructuredDataLabeler":{add_label:[23,4,1,""],check_pipeline:[23,4,1,""],help:[23,4,1,""],label_mapping:[23,2,1,""],labels:[23,2,1,""],load_from_disk:[23,4,1,""],load_from_library:[23,4,1,""],load_with_components:[23,4,1,""],model:[23,2,1,""],postprocessor:[23,2,1,""],predict:[23,4,1,""],preprocessor:[23,2,1,""],reverse_label_mapping:[23,2,1,""],save_to_disk:[23,4,1,""],set_labels:[23,4,1,""],set_model:[23,4,1,""],set_params:[23,4,1,""],set_postprocessor:[23,4,1,""],set_preprocessor:[23,4,1,""]},"dataprofiler.labelers.data_labelers.UnstructuredDataLabeler":{add_label:[23,4,1,""],check_pipeline:[23,4,1,""],help:[23,4,1,""],label_mapping:[23,2,1,""],labels:[23,2,1,""],load_from_disk:[23,4,1,""],load_from_library:[23,4,1,""],load_with_components:[23,4,1,""],model:[23,2,1,""],postprocessor:[23,2,1,""],predict:[23,4,1,""],preprocessor:[23,2,1,""],reverse_label_mapping:[23,2,1,""],save_to_disk:[23,4,1,""],set_labels:[23,4,1,""],set_model:[23,4,1,""],set_params:[23,4,1,""],set_postprocessor:[23,4,1,""],set_preprocessor:[23,4,1,""]},"dataprofiler.labelers.data_processing":{AutoSubRegistrationMeta:[24,1,1,""],BaseDataPostprocessor:[24,1,1,""],BaseDataPreprocessor:[24,1,1,""],BaseDataProcessor:[24,1,1,""],CharPostprocessor:[24,1,1,""],CharPreprocessor:[24,1,1,""],DirectPassPreprocessor:[24,1,1,""],RegexPostProcessor:[24,1,1,""],StructCharPostprocessor:[24,1,1,""],StructCharPreprocessor:[24,1,1,""]},"dataprofiler.labelers.data_processing.AutoSubRegistrationMeta":{mro:[24,4,1,""],register:[24,4,1,""]},"dataprofiler.labelers.data_processing.BaseDataPostprocessor":{get_class:[24,4,1,""],get_parameters:[24,4,1,""],help:[24,4,1,""],load_from_disk:[24,4,1,""],load_from_library:[24,4,1,""],process:[24,4,1,""],processor_type:[24,3,1,""],save_to_disk:[24,4,1,""],set_params:[24,4,1,""]},"dataprofiler.labelers.data_processing.BaseDataPreprocessor":{get_class:[24,4,1,""],get_parameters:[24,4,1,""],help:[24,4,1,""],load_from_disk:[24,4,1,""],load_from_library:[24,4,1,""],process:[24,4,1,""],processor_type:[24,3,1,""],save_to_disk:[24,4,1,""],set_params:[24,4,1,""]},"dataprofiler.labelers.data_processing.BaseDataProcessor":{get_class:[24,4,1,""],get_parameters:[24,4,1,""],help:[24,4,1,""],load_from_disk:[24,4,1,""],load_from_library:[24,4,1,""],
process:[24,4,1,""],processor_type:[24,3,1,""],save_to_disk:[24,4,1,""],set_params:[24,4,1,""]},"dataprofiler.labelers.data_processing.CharPostprocessor":{convert_to_NER_format:[24,4,1,""],get_class:[24,4,1,""],get_parameters:[24,4,1,""],help:[24,4,1,""],load_from_disk:[24,4,1,""],load_from_library:[24,4,1,""],match_sentence_lengths:[24,4,1,""],process:[24,4,1,""],processor_type:[24,3,1,""],save_to_disk:[24,4,1,""],set_params:[24,4,1,""]},"dataprofiler.labelers.data_processing.CharPreprocessor":{get_class:[24,4,1,""],get_parameters:[24,4,1,""],help:[24,4,1,""],load_from_disk:[24,4,1,""],load_from_library:[24,4,1,""],process:[24,4,1,""],processor_type:[24,3,1,""],save_to_disk:[24,4,1,""],set_params:[24,4,1,""]},"dataprofiler.labelers.data_processing.DirectPassPreprocessor":{get_class:[24,4,1,""],get_parameters:[24,4,1,""],help:[24,4,1,""],load_from_disk:[24,4,1,""],load_from_library:[24,4,1,""],process:[24,4,1,""],processor_type:[24,3,1,""],save_to_disk:[24,4,1,""],set_params:[24,4,1,""]},"dataprofiler.labelers.data_processing.RegexPostProcessor":{get_class:[24,4,1,""],get_parameters:[24,4,1,""],help:[24,4,1,""],load_from_disk:[24,4,1,""],load_from_library:[24,4,1,""],priority_prediction:[24,4,1,""],process:[24,4,1,""],processor_type:[24,3,1,""],save_to_disk:[24,4,1,""],set_params:[24,4,1,""],split_prediction:[24,4,1,""]},"dataprofiler.labelers.data_processing.StructCharPostprocessor":{convert_to_structured_analysis:[24,4,1,""],get_class:[24,4,1,""],get_parameters:[24,4,1,""],help:[24,4,1,""],load_from_disk:[24,4,1,""],load_from_library:[24,4,1,""],match_sentence_lengths:[24,4,1,""],process:[24,4,1,""],processor_type:[24,3,1,""],save_to_disk:[24,4,1,""],set_params:[24,4,1,""]},"dataprofiler.labelers.data_processing.StructCharPreprocessor":{convert_to_unstructured_format:[24,4,1,""],get_class:[24,4,1,""],get_parameters:[24,4,1,""],help:[24,4,1,""],load_from_disk:[24,4,1,""],load_from_library:[24,4,1,""],process:[24,4,1,""],processor_type:[24,3,1,""],save_to_disk:[24,4,1,""],set_params:[24,4,1,""]},"dataprofiler.labelers.labeler_utils":{evaluate_accuracy:[25,5,1,""],f1_report_dict_to_str:[25,5,1,""]},"dataprofiler.labelers.regex_model":{RegexModel:[26,1,1,""]},"dataprofiler.labelers.regex_model.RegexModel":{add_label:[26,4,1,""],get_class:[26,4,1,""],get_parameters:[26,4,1,""],help:[26,4,1,""],label_mapping:[26,2,1,""],labels:[26,2,1,""],load_from_disk:[26,4,1,""],num_labels:[26,2,1,""],predict:[26,4,1,""],requires_zero_mapping:[26,3,1,""],reset_weights:[26,4,1,""],reverse_label_mapping:[26,2,1,""],save_to_disk:[26,4,1,""],set_label_mapping:[26,4,1,""],set_params:[26,4,1,""]},"dataprofiler.profilers":{base_column_profilers:[28,0,0,"-"],categorical_column_profile:[29,0,0,"-"],column_profile_compilers:[30,0,0,"-"],data_labeler_column_profile:[31,0,0,"-"],datetime_column_profile:[32,0,0,"-"],float_column_profile:[33,0,0,"-"],helpers:[34,0,0,"-"],histogram_utils:[36,0,0,"-"],int_column_profile:[37,0,0,"-"],numerical_column_stats:[38,0,0,"-"],order_column_profile:[39,0,0,"-"],profile_builder:[40,0,0,"-"],profiler_options:[41,0,0,"-"],text_column_profile:[42,0,0,"-"],unstructured_labeler_profile:[44,0,0,"-"],unstructured_text_profile:[45,0,0,"-"],utils:[46,0,0,"-"]},"dataprofiler.profilers.base_column_profilers":{BaseColumnPrimitiveTypeProfiler:[28,1,1,""],BaseColumnProfiler:[28,1,1,""]},"dataprofiler.profilers.base_column_profilers.BaseColumnPrimitiveTypeProfiler":{col_type:[28,3,1,""],diff:[28,4,1,""],profile:[28,2,1,""],update:[28,4,1,""]},"dataprofiler.profilers.base_column_profilers.BaseColumnP
rofiler":{col_type:[28,3,1,""],diff:[28,4,1,""],profile:[28,2,1,""],update:[28,4,1,""]},"dataprofiler.profilers.categorical_column_profile":{CategoricalColumn:[29,1,1,""]},"dataprofiler.profilers.categorical_column_profile.CategoricalColumn":{categories:[29,2,1,""],col_type:[29,3,1,""],diff:[29,4,1,""],gini_impurity:[29,2,1,""],is_match:[29,2,1,""],profile:[29,2,1,""],type:[29,3,1,""],unalikeability:[29,2,1,""],unique_ratio:[29,2,1,""],update:[29,4,1,""]},"dataprofiler.profilers.column_profile_compilers":{BaseCompiler:[30,1,1,""],ColumnDataLabelerCompiler:[30,1,1,""],ColumnPrimitiveTypeProfileCompiler:[30,1,1,""],ColumnStatsProfileCompiler:[30,1,1,""],UnstructuredCompiler:[30,1,1,""]},"dataprofiler.profilers.column_profile_compilers.BaseCompiler":{diff:[30,4,1,""],profile:[30,2,1,""],update_profile:[30,4,1,""]},"dataprofiler.profilers.column_profile_compilers.ColumnDataLabelerCompiler":{diff:[30,4,1,""],profile:[30,2,1,""],update_profile:[30,4,1,""]},"dataprofiler.profilers.column_profile_compilers.ColumnPrimitiveTypeProfileCompiler":{diff:[30,4,1,""],profile:[30,2,1,""],selected_data_type:[30,2,1,""],update_profile:[30,4,1,""]},"dataprofiler.profilers.column_profile_compilers.ColumnStatsProfileCompiler":{diff:[30,4,1,""],profile:[30,2,1,""],update_profile:[30,4,1,""]},"dataprofiler.profilers.column_profile_compilers.UnstructuredCompiler":{diff:[30,4,1,""],profile:[30,2,1,""],update_profile:[30,4,1,""]},"dataprofiler.profilers.data_labeler_column_profile":{DataLabelerColumn:[31,1,1,""]},"dataprofiler.profilers.data_labeler_column_profile.DataLabelerColumn":{assert_equal_conditions:[31,4,1,""],avg_predictions:[31,2,1,""],col_type:[31,3,1,""],data_label:[31,2,1,""],diff:[31,4,1,""],label_representation:[31,2,1,""],profile:[31,2,1,""],type:[31,3,1,""],update:[31,4,1,""]},"dataprofiler.profilers.datetime_column_profile":{DateTimeColumn:[32,1,1,""]},"dataprofiler.profilers.datetime_column_profile.DateTimeColumn":{col_type:[32,3,1,""],data_type_ratio:[32,2,1,""],diff:[32,4,1,""],profile:[32,2,1,""],type:[32,3,1,""],update:[32,4,1,""]},"dataprofiler.profilers.float_column_profile":{FloatColumn:[33,1,1,""]},"dataprofiler.profilers.float_column_profile.FloatColumn":{col_type:[33,3,1,""],data_type_ratio:[33,2,1,""],diff:[33,4,1,""],is_float:[33,4,1,""],is_int:[33,4,1,""],kurtosis:[33,2,1,""],mean:[33,2,1,""],mode:[33,2,1,""],np_type_to_type:[33,4,1,""],precision:[33,2,1,""],profile:[33,2,1,""],skewness:[33,2,1,""],stddev:[33,2,1,""],type:[33,3,1,""],update:[33,4,1,""],variance:[33,2,1,""]},"dataprofiler.profilers.helpers":{report_helpers:[35,0,0,"-"]},"dataprofiler.profilers.helpers.report_helpers":{calculate_quantiles:[35,5,1,""],flat_dict:[35,5,1,""]},"dataprofiler.profilers.int_column_profile":{IntColumn:[37,1,1,""]},"dataprofiler.profilers.int_column_profile.IntColumn":{col_type:[37,3,1,""],data_type_ratio:[37,2,1,""],diff:[37,4,1,""],is_float:[37,4,1,""],is_int:[37,4,1,""],kurtosis:[37,2,1,""],mean:[37,2,1,""],mode:[37,2,1,""],np_type_to_type:[37,4,1,""],profile:[37,2,1,""],skewness:[37,2,1,""],stddev:[37,2,1,""],type:[37,3,1,""],update:[37,4,1,""],variance:[37,2,1,""]},"dataprofiler.profilers.numerical_column_stats":{NumericStatsMixin:[38,1,1,""],abstractstaticmethod:[38,1,1,""]},"dataprofiler.profilers.numerical_column_stats.NumericStatsMixin":{diff:[38,4,1,""],is_float:[38,4,1,""],is_int:[38,4,1,""],kurtosis:[38,2,1,""],mean:[38,2,1,""],mode:[38,2,1,""],np_type_to_type:[38,4,1,""],profile:[38,4,1,""],skewness:[38,2,1,""],stddev:[38,2,1,""],type:[38,3,1,""],update:[38,4,1,""],variance:[38,2,1,
""]},"dataprofiler.profilers.order_column_profile":{OrderColumn:[39,1,1,""]},"dataprofiler.profilers.order_column_profile.OrderColumn":{col_type:[39,3,1,""],diff:[39,4,1,""],profile:[39,2,1,""],type:[39,3,1,""],update:[39,4,1,""]},"dataprofiler.profilers.profile_builder":{BaseProfiler:[40,1,1,""],Profiler:[40,1,1,""],StructuredColProfiler:[40,1,1,""],StructuredProfiler:[40,1,1,""],UnstructuredProfiler:[40,1,1,""]},"dataprofiler.profilers.profile_builder.BaseProfiler":{diff:[40,4,1,""],load:[40,4,1,""],profile:[40,2,1,""],report:[40,4,1,""],save:[40,4,1,""],update_profile:[40,4,1,""]},"dataprofiler.profilers.profile_builder.Profiler":{load:[40,4,1,""]},"dataprofiler.profilers.profile_builder.StructuredColProfiler":{clean_data_and_get_base_stats:[40,4,1,""],diff:[40,4,1,""],profile:[40,2,1,""],update_column_profilers:[40,4,1,""],update_profile:[40,4,1,""]},"dataprofiler.profilers.profile_builder.StructuredProfiler":{diff:[40,4,1,""],load:[40,4,1,""],profile:[40,2,1,""],report:[40,4,1,""],save:[40,4,1,""],update_profile:[40,4,1,""]},"dataprofiler.profilers.profile_builder.UnstructuredProfiler":{diff:[40,4,1,""],load:[40,4,1,""],profile:[40,2,1,""],report:[40,4,1,""],save:[40,4,1,""],update_profile:[40,4,1,""]},"dataprofiler.profilers.profiler_options":{BaseInspectorOptions:[41,1,1,""],BaseOption:[41,1,1,""],BooleanOption:[41,1,1,""],CategoricalOptions:[41,1,1,""],CorrelationOptions:[41,1,1,""],DataLabelerOptions:[41,1,1,""],DateTimeOptions:[41,1,1,""],FloatOptions:[41,1,1,""],HistogramOption:[41,1,1,""],IntOptions:[41,1,1,""],ModeOption:[41,1,1,""],NumericalOptions:[41,1,1,""],OrderOptions:[41,1,1,""],PrecisionOptions:[41,1,1,""],ProfilerOptions:[41,1,1,""],StructuredOptions:[41,1,1,""],TextOptions:[41,1,1,""],TextProfilerOptions:[41,1,1,""],UnstructuredOptions:[41,1,1,""]},"dataprofiler.profilers.profiler_options.BaseInspectorOptions":{is_prop_enabled:[41,4,1,""],properties:[41,2,1,""],set:[41,4,1,""],validate:[41,4,1,""]},"dataprofiler.profilers.profiler_options.BaseOption":{properties:[41,2,1,""],set:[41,4,1,""],validate:[41,4,1,""]},"dataprofiler.profilers.profiler_options.BooleanOption":{properties:[41,2,1,""],set:[41,4,1,""],validate:[41,4,1,""]},"dataprofiler.profilers.profiler_options.CategoricalOptions":{is_prop_enabled:[41,4,1,""],properties:[41,2,1,""],set:[41,4,1,""],validate:[41,4,1,""]},"dataprofiler.profilers.profiler_options.CorrelationOptions":{is_prop_enabled:[41,4,1,""],properties:[41,2,1,""],set:[41,4,1,""],validate:[41,4,1,""]},"dataprofiler.profilers.profiler_options.DataLabelerOptions":{is_prop_enabled:[41,4,1,""],properties:[41,2,1,""],set:[41,4,1,""],validate:[41,4,1,""]},"dataprofiler.profilers.profiler_options.DateTimeOptions":{is_prop_enabled:[41,4,1,""],properties:[41,2,1,""],set:[41,4,1,""],validate:[41,4,1,""]},"dataprofiler.profilers.profiler_options.FloatOptions":{is_numeric_stats_enabled:[41,2,1,""],is_prop_enabled:[41,4,1,""],properties:[41,2,1,""],set:[41,4,1,""],validate:[41,4,1,""]},"dataprofiler.profilers.profiler_options.HistogramOption":{properties:[41,2,1,""],set:[41,4,1,""],validate:[41,4,1,""]},"dataprofiler.profilers.profiler_options.IntOptions":{is_numeric_stats_enabled:[41,2,1,""],is_prop_enabled:[41,4,1,""],properties:[41,2,1,""],set:[41,4,1,""],validate:[41,4,1,""]},"dataprofiler.profilers.profiler_options.ModeOption":{properties:[41,2,1,""],set:[41,4,1,""],validate:[41,4,1,""]},"dataprofiler.profilers.profiler_options.NumericalOptions":{is_numeric_stats_enabled:[41,2,1,""],is_prop_enabled:[41,4,1,""],properties:[41,2,1,""],set:[41,4,1,""],valid
ate:[41,4,1,""]},"dataprofiler.profilers.profiler_options.OrderOptions":{is_prop_enabled:[41,4,1,""],properties:[41,2,1,""],set:[41,4,1,""],validate:[41,4,1,""]},"dataprofiler.profilers.profiler_options.PrecisionOptions":{properties:[41,2,1,""],set:[41,4,1,""],validate:[41,4,1,""]},"dataprofiler.profilers.profiler_options.ProfilerOptions":{properties:[41,2,1,""],set:[41,4,1,""],validate:[41,4,1,""]},"dataprofiler.profilers.profiler_options.StructuredOptions":{enabled_profiles:[41,2,1,""],properties:[41,2,1,""],set:[41,4,1,""],validate:[41,4,1,""]},"dataprofiler.profilers.profiler_options.TextOptions":{is_numeric_stats_enabled:[41,2,1,""],is_prop_enabled:[41,4,1,""],properties:[41,2,1,""],set:[41,4,1,""],validate:[41,4,1,""]},"dataprofiler.profilers.profiler_options.TextProfilerOptions":{is_prop_enabled:[41,4,1,""],properties:[41,2,1,""],set:[41,4,1,""],validate:[41,4,1,""]},"dataprofiler.profilers.profiler_options.UnstructuredOptions":{enabled_profiles:[41,2,1,""],properties:[41,2,1,""],set:[41,4,1,""],validate:[41,4,1,""]},"dataprofiler.profilers.text_column_profile":{TextColumn:[42,1,1,""]},"dataprofiler.profilers.text_column_profile.TextColumn":{col_type:[42,3,1,""],data_type_ratio:[42,2,1,""],diff:[42,4,1,""],is_float:[42,4,1,""],is_int:[42,4,1,""],kurtosis:[42,2,1,""],mean:[42,2,1,""],mode:[42,2,1,""],np_type_to_type:[42,4,1,""],profile:[42,2,1,""],skewness:[42,2,1,""],stddev:[42,2,1,""],type:[42,3,1,""],update:[42,4,1,""],variance:[42,2,1,""]},"dataprofiler.profilers.unstructured_labeler_profile":{UnstructuredLabelerProfile:[44,1,1,""]},"dataprofiler.profilers.unstructured_labeler_profile.UnstructuredLabelerProfile":{diff:[44,4,1,""],label_encoding:[44,2,1,""],profile:[44,2,1,""],type:[44,3,1,""],update:[44,4,1,""]},"dataprofiler.profilers.unstructured_text_profile":{TextProfiler:[45,1,1,""]},"dataprofiler.profilers.unstructured_text_profile.TextProfiler":{diff:[45,4,1,""],profile:[45,2,1,""],type:[45,3,1,""],update:[45,4,1,""]},"dataprofiler.profilers.utils":{KeyDict:[46,1,1,""],add_nested_dictionaries:[46,5,1,""],biased_kurt:[46,5,1,""],biased_skew:[46,5,1,""],dict_merge:[46,5,1,""],find_diff_of_dates:[46,5,1,""],find_diff_of_dicts:[46,5,1,""],find_diff_of_dicts_with_diff_keys:[46,5,1,""],find_diff_of_lists_and_sets:[46,5,1,""],find_diff_of_matrices:[46,5,1,""],find_diff_of_numbers:[46,5,1,""],find_diff_of_strings_and_bools:[46,5,1,""],generate_pool:[46,5,1,""],get_memory_size:[46,5,1,""],method_timeit:[46,5,1,""],overlap:[46,5,1,""],partition:[46,5,1,""],shuffle_in_chunks:[46,5,1,""],suggest_pool_size:[46,5,1,""],warn_on_profile:[46,5,1,""]},"dataprofiler.profilers.utils.KeyDict":{clear:[46,4,1,""],copy:[46,4,1,""],default_factory:[46,3,1,""],fromkeys:[46,4,1,""],get:[46,4,1,""],items:[46,4,1,""],keys:[46,4,1,""],pop:[46,4,1,""],popitem:[46,4,1,""],setdefault:[46,4,1,""],update:[46,4,1,""],values:[46,4,1,""]},"dataprofiler.reports":{graphs:[48,0,0,"-"]},"dataprofiler.reports.graphs":{plot_col_histogram:[48,5,1,""],plot_histograms:[48,5,1,""]},"dataprofiler.validators":{base_validators:[51,0,0,"-"]},"dataprofiler.validators.base_validators":{Validator:[51,1,1,""],is_in_list:[51,5,1,""],is_in_range:[51,5,1,""]},"dataprofiler.validators.base_validators.Validator":{get:[51,4,1,""],validate:[51,4,1,""]},dataprofiler:{data_readers:[6,0,0,"-"],dp_logging:[17,0,0,"-"],labelers:[18,0,0,"-"],profilers:[27,0,0,"-"],reports:[47,0,0,"-"],set_seed:[5,5,1,""],settings:[49,0,0,"-"],validators:[50,0,0,"-"],version:[52,0,0,"-"]}},objnames:{"0":["py","module","Python 
module"],"1":["py","class","Python class"],"2":["py","property","Python property"],"3":["py","attribute","Python attribute"],"4":["py","method","Python method"],"5":["py","function","Python function"]},objtypes:{"0":"py:module","1":"py:class","2":"py:property","3":"py:attribute","4":"py:method","5":"py:function"},terms:{"0":[1,2,3,4,11,12,19,21,22,24,25,29,33,36,37,38,40,42,54,55,56,57,59,60,61,62],"00":[11,25,33,37,38,42],"000":[55,60,61],"000043219499392912":2,"01":46,"05":3,"07":[3,57],"0700":57,"0f":57,"0m":3,"0s":1,"1":[1,3,4,11,19,20,21,22,23,24,25,29,31,33,37,38,42,51,54,55,57,59,60,61],"10":[2,3,11,21,54,55,57,60],"100":57,"1000":[9,13,59,60,61],"10000":[55,60],"1024":11,"11":57,"11111111":57,"1111111111111":57,"12":21,"123":57,"1234":2,"127":1,"13":[3,57],"15":[3,59],"16":[57,61],"18":[2,55],"19":36,"2":[1,2,3,19,21,22,23,29,30,40,46,51,54,55,57,59,60,61],"20":[11,21],"200":4,"2004":22,"2005":[29,57],"2014":3,"2018":46,"2019":21,"21":3,"22":22,"23":3,"27":29,"29":3,"2x2":22,"3":[2,3,21,22,25,51,54,55,57,59,60,61],"30":[21,22],"301":2,"31":[3,57],"31m":3,"32":[1,19,20,21,23,24],"326":3,"327":3,"33":25,"334":3,"335":3,"34":3,"3400":[1,24],"35":3,"37":57,"3f":57,"3x":62,"4":[0,3,21,25,55,57,59,60,61,62],"40":25,"43":3,"456":57,"5":[3,22,41,54,55,60],"50":25,"5000":40,"55":3,"56":3,"57":25,"58":[3,29],"59":3,"6":55,"60":[13,25,29],"61":3,"62":3,"63":3,"64":1,"65":3,"65536":11,"67":[22,25],"7":[55,57],"75":[3,24,25],"7890":57,"7bit":57,"8":[3,11,28,38,40,41,51,60],"80":[3,25,33,37,38,42],"84":3,"85":59,"89":3,"9":3,"92m":3,"95":3,"97":2,"98":3,"\u03c3":29,"abstract":[2,8,20,24,28,30,38],"boolean":[1,4,11,25,41,60],"break":1,"byte":11,"case":[1,3,21,22,24,41,42,46,55,60],"char":[11,20,21,24,26,55,61,62],"class":[1,2,3,4,7,8,9,10,12,13,14,15,16,18,19,20,21,22,23,24,25,26,28,29,30,31,32,33,37,38,39,40,41,42,44,45,46,48,51,53,55,57,60,61,62],"default":[1,2,3,4,9,21,22,24,41,46,55,56,57,60],"do":[21,41,54],"final":[1,21],"float":[1,2,5,11,19,21,22,24,25,27,32,37,38,40,41,42,46,48,51,55,57,60,61,62],"function":[1,2,3,11,15,19,20,21,22,23,24,25,26,33,35,36,37,38,41,42,46,54,57,59,61,62],"import":[1,2,3,4,55,57,59,60,61,62],"int":[1,4,5,9,11,12,16,17,19,20,21,22,23,24,25,27,29,32,33,38,40,41,42,46,48,51,55,57,59,60,61,62],"long":1,"new":[7,8,9,13,14,16,19,20,21,23,26,29,46,53,55,57,60,61,62],"null":[38,40,41,55,59,60,61,62],"return":[1,3,4,7,8,9,10,11,12,13,14,15,16,17,19,20,21,22,23,24,25,26,28,29,30,31,32,33,35,37,38,39,40,41,42,44,45,46,48,51,55,57,60,61,62],"short":1,"static":[24,31,33,37,38,40,42],"super":21,"switch":57,"true":[1,2,3,4,11,16,19,21,22,23,24,25,26,29,33,37,38,41,42,46,55,57,60,62],"try":[3,21,59],"var":[55,61],"while":[1,2,3,22,59],A:[4,21,22,24,26,46,53,55,59,60,61],AS:21,As:[3,21,57,59,61,62],At:13,By:[2,3,22,57,60],For:[2,3,4,21,22,29,33,37,38,42,46,56,57,59,60,61,62],IS:21,If:[11,21,22,31,41,46,55,56,60],In:[1,2,3,11,21,22,24,46,55,57,59,60,61,62],Is:3,It:[2,4,21,55,57,61,62],NOT:21,No:[29,51,60],Not:[61,62],OF:21,OR:[11,21],One:[59,60,61,62],Or:[5,6],The:[0,1,2,3,4,11,18,21,22,41,46,51,54,55,56,57,59,60,61,62],There:[3,55,57,60],These:[3,21,53,54,62],To:[2,55,56,57,59,61,62],Will:[2,41],With:[54,60,61,62],_:35,__call__:21,__init__:[1,2,21],__missing__:46,_argmax_threshold_lay:1,_char_encoding_lay:1,_construct_model:[1,2],_epoch_id:1,_model:1,_model_default_ind:1,_model_num_label:1,_need_to_reconstruct_model:2,_paramet:1,_postprocessor:1,_preprocessor:1,_reconstruct_model:2,_save_processor:2,_validate_paramet:[1,2],a_natural_but_biased_estim:46,a_out:21,ab:21,abc:[3,20,2
4,57],abcmeta:[20,24],abil:[3,59,61,62],abl:[2,19,21,57],about:[2,3,21,55,57,61,62],abov:[1,4,21,55,56,57,60],abstractstaticmethod:38,acc:1,accept:[1,2,21,60],access:[4,17,21,53,55,60],accomplish:[61,62],account:[2,22,46,55],accumul:21,accur:61,accuraci:[22,25,57],across:[24,51],activ:[1,3,21,56],activity_regular:21,ad:[11,19,20,21,23,26,40,53,61,62],adam:1,adapt:[1,21],add:[1,2,15,19,20,21,23,26,46,55,57,61,62],add_label:[2,19,20,21,23,26,57],add_loss:21,add_metr:21,add_nested_dictionari:46,add_true_false_color:3,add_upd:21,add_vari:21,add_weight:21,addit:[1,3,11,21,25,55,57,59,60,61,62],addition:[3,4,40,59],addon:21,address:[2,55,57],adjust:60,advanc:22,aforement:59,after:[53,59],again:57,aggreg:[21,24],aggregation_func:24,agre:21,algorithm:3,alia:21,all:[1,3,7,8,9,11,13,14,16,20,21,22,24,26,31,40,41,42,46,54,56,57,59,60],all_fil:[3,59],alloc:46,allow:[2,3,19,20,21,23,24,40,46,57,59,60,61,62],along:[38,40,51,55,60],alreadi:[1,56],also:[1,2,3,4,7,9,13,14,16,21,22,41,51,54,55,57,59,60,61,62],alter:[19,20,21,22,23,24,26,53,60],altern:[61,62],although:41,alwai:[12,42],amount:60,an:[1,7,8,9,11,13,14,15,16,20,21,22,24,26,29,33,37,38,39,42,46,51,55,57,59,60,61,62],analysi:[24,55,60],analyz:[61,62],ani:[2,3,4,9,14,21,24,26,40,41,55,56,57,59,60,61,62],anoth:[21,29,55,57,60,61,62],anyth:24,apach:21,api:[21,55],append:[1,57,61,62],appli:[21,24,26,40,60],applic:[21,22,41,55,59,61,62],appropri:[20,21,26,31,32,39,41,57,61,62],approxim:24,apt:56,ar:[1,2,3,4,9,11,20,21,22,24,26,29,40,46,48,54,55,56,57,59,60,61,62],arbitrari:46,architectur:[1,2],aren:21,arg:[21,24],argmax:[1,21,24],argmax_lay:1,argmax_output:1,argument:[11,19,21,23,46],arrai:[2,21,22,25,46],ascend:60,ascii:57,ascii_num:1,aspect:[61,62],assert_equal_condit:31,assign:46,assist:2,associ:[19,21,23,26],assum:[21,24,30,46,59],astyp:[1,57],attempt:60,attr:[20,24,41],attribut:[21,53],attributeerror:[3,21],attributt:[55,60],aug:57,august:57,author:[3,21,59,61],auto:[3,4,10,41,55,60],automat:[21,53,55,60],autosubregistrationmeta:[20,24],avail:[4,54],averag:[1,21,22,31],avg:25,avg_predict:[31,55,60,61],avro:[3,4,5,6,14,54,55,56,59,60,61,62],avro_data:[7,10,59],avro_fil:[3,59],avrodata:[3,7,10,59],aw:[1,57],aws_honeypot_marx_geo:[1,3,59,61],ax:[48,54],b:[21,46,54],b_out:21,back:21,backend:1,background:[24,26],backward:21,ban:[2,55],bank:[2,55],base:[2,4,5,6,7,9,10,12,13,14,15,16,18,21,23,24,26,27,29,30,31,32,33,37,38,39,40,41,42,44,45,46,50,59,60,61],base_column_profil:[28,29,31,32,33,37,39,42],base_data:[7,8,9,13,14,16,40],base_data_label:[19,23],base_model:[1,2,19,20,21,23,26],base_valid:51,basecolumnprimitivetypeprofil:[28,32,33,37,42],basecolumnprofil:[28,29,31,32,38,39,42],basecompil:30,basedata:[7,8,9,13,14,16,40],basedatalabel:[18,19,23,41,60],basedatapostprocessor:[2,19,23,24],basedatapreprocessor:[2,19,23,24],basedataprocessor:24,baseinspectoropt:41,basemodel:[1,2,19,20,23,26,57],baseopt:41,baseprocessor:57,baseprofil:40,basetrainablemodel:[20,21],basi:21,basic:[2,57],batch:[19,20,21,23,24,55,59,60],batch_1:60,batch_2:60,batch_data:24,batch_siz:[7,8,9,13,14,16,19,20,21,23,24,26],batchnorm:21,bc:3,bcc:57,bdist:56,bdist_wheel:56,becaus:[3,21,22,46],becom:[21,24,46],been:[3,21],befor:[2,21],being:[3,4,7,8,9,11,12,13,14,15,16,19,20,21,22,23,24,26,31,40,41],belong:2,below:[3,4,21,22,24,41,54,55,57,59,60,61,62],besid:[57,59],best:[3,4,31,55],beta:[21,22],better:57,between:[2,3,21,22,24,30,31,32,39,40,41,44,45,46,57,60],beyond:[61,62],bfloat16:21,bia:[21,41,60],bias:46,bias_correct:[41,60],biased_kurt:46,biased_skew:46,big:57,bin:[41,56,59,60],bin_
count:[55,61],bin_count_or_method:[41,59,60],bin_edg:[55,61],binari:22,bit:21,blob:[21,36],blog:3,blogpost:3,bool:[7,9,11,13,14,16,19,20,21,22,23,24,26,33,37,38,41,42,46,51,55,61],booleanopt:41,both:[3,11,21,41,57,60],bound:46,breadth:21,breakdown:60,breviti:57,brew:56,broken:59,buffer:[5,6,11],buffer_s:11,bufferediobas:12,bug:55,build:[1,11,21,22,38,40,51,53],build_embd_dictionari:[1,21],builder:[5,27],built:[1,18,21],builtin:2,byte_s:11,bytesio:[11,12],c:[21,54],cach:21,calcul:[2,21,22,29,32,33,37,40,41,42,46,55,60,61,62],calculate_quantil:35,call:[2,4,11,21,22,41,46,59,61],callabl:[21,46],can:[1,2,3,4,7,8,9,13,14,15,16,21,22,40,42,46,54,55,56,57,59,60,61,62],candid:21,cannot:[2,11,21],capabl:[3,21],capitalon:[3,56],carat:3,cast:21,categor:[5,27,41,55,60,61],categori:[1,29,41,55,57,59,60,61],categorical_column_profil:29,categorical_count:[29,55,60],categorical_crossentropi:1,categoricalcolumn:29,categoricalopt:41,caus:[21,41],cc:57,cdot:21,cell:[2,11,55,57],certain:21,chang:[2,3,21,30,33,37,38,40,42,61,62],char_in_vector:1,charact:[2,3,5,9,11,18,24,26,35,40,41,53,55,57,59,60],character_argmax:24,character_level_cnn_model:[1,2,21,57],characterlevelcnnmodel:[1,2,18,21,24,57],characterlevellstmmodel:1,charpostprocessor:24,charpreprocessor:[18,24],charset:57,check:[1,2,7,9,11,13,14,19,21,23,29,33,37,38,41,42,51,53,57,59],check_pipelin:[2,19,23,57],chi2:60,child:17,children:21,choic:[3,4,7,9,13,14,16],choos:[3,24,60],chose:24,chosen:[60,62],chr:1,chunk:[4,11,16,46],chunk_siz:46,chunk_size_byt:11,cij:29,clariti:3,class_nam:[20,21,24,26],classif:[5,18,21,29],classifi:2,classification_report:[22,25],classification_report_util:22,classmethod:[7,8,9,13,14,16,19,20,21,23,24,26,40],clean_data_and_get_base_stat:40,clean_sampled_df:40,clear:46,clear_sess:1,clsname:[20,24],cn:57,cnn:[1,5,18],code:[3,21,22,28,38,40,41,46,51,55,57,59],col:[40,46,61],col_float:[59,61],col_int:[59,61],col_profil:46,col_report:[1,57],col_typ:[28,29,31,32,33,37,39,42],collaps:35,collect:[21,46,59,61,62],color:3,column:[1,2,4,5,9,11,14,22,24,27,40,41,44,45,46,48,51,53,54,55,57,59,60,61,62],column_count:[55,59,60,61],column_nam:[1,51,55,57,60,61],column_profile_compil:30,columnar:1,columndatalabelercompil:30,columnprimitivetypeprofilecompil:30,columnstatsprofilecompil:30,com:[4,21,36,46,56,57],combin:[1,3,9,46,59],come:[55,59],command:[4,55,61,62],comment:1,common:41,commonli:[3,61,62],compact:[40,59,60,61,62],compani:57,compar:25,compat:[2,7,8,9,13,14,16,21,55,57],compen:57,compil:[1,5,27],complex:2,complex_nest:[3,59],complianc:21,compon:[0,1,19,21,22,23,57,59,61,62],compos:21,compris:57,comput:[21,22,59],compute_dtyp:21,compute_mask:21,compute_output_shap:21,compute_output_signatur:21,concaten:35,conclus:53,condit:[21,60],conduct:[3,24],conf:[2,24,57],conf_matrix:22,confid:[1,2,20,21,24,26,57],confidence_level:[55,61],config:[21,51],configur:51,conflict:41,confus:[22,25],confusion_matrix:22,confusion_matrix_fil:25,connect:[1,19,21,23],conserv:60,consid:[1,11,16,21,26,41,42],constant_initi:21,constraint:60,construct:[1,2,21,41],constructor:[1,21],contain:[1,2,3,4,7,13,19,20,21,23,24,26,31,32,39,41,46,52,54,55,57,59,60,61,62],content:57,context:12,continu:[2,60],contrast:60,conv2d:21,convert:[2,3,11,20,21,22,24,25,26,33,37,38,42,57],convert_confusion_matrix_to_mcm:22,convert_int_to_str:11,convert_to_ner_format:24,convert_to_structured_analysi:24,convert_to_tensor:21,convert_to_unstructured_format:24,convolut:1,copi:[21,22,41,46],copyright:21,core:[29,30,31,32,33,37,38,39,40,42,45],correct:60,correctli:3,correl:[41,55],correlat
ion_matrix:[55,60],correlationopt:41,correspond:[21,22,24,46,59],cost:[46,57],could:[24,31,61,62],count:[21,22,41,44,60,62],count_param:21,coupl:60,cover:[59,61,62],cpu:46,cpu_count:46,creat:[1,2,21,23,24,26,46,57,60,61,62],create_glove_char:[1,21],creation:21,credit_card:[2,55],critic:17,cross:19,csv:[1,2,3,4,5,6,11,54,55,57,59,60,61,62],csv_data:[3,4,9,10,55,59,60],csv_file:[3,59],csvdata:[9,10,53,55,59,60],current:[2,4,20,21,24,26,54,55,60,61,62],custom:[55,59,60],cut:[3,55],d:[1,21,46,54],dai:46,daili:3,dask:51,data1:[55,60,61,62],data2:[55,60,61,62],data:[0,1,5,15,18,20,21,22,26,27,28,29,30,32,33,37,38,39,40,41,42,44,45,46,51,53,56,58,61,62],data_and_metadata:[4,7,13],data_as_str:11,data_class:10,data_fold:3,data_format:[4,7,8,9,13,14,16,53,62],data_gener:11,data_label:[1,2,23,31,41,44,55,57,59,60,61,62],data_label_represent:[55,61],data_labeler2:31,data_labeler_column_profil:31,data_labeler_dirpath:[41,44,57,60],data_labeler_object:[1,41,57,60],data_labeler_sav:[1,57],data_length:46,data_list:11,data_object:[61,62],data_path:[59,61,62],data_process:[2,19,23,24,57],data_processor:[19,23],data_read:[3,4,7,8,9,10,11,12,13,14,15,16,40,55,56,59,60],data_s:46,data_stat:[1,55,57,59,60,61,62],data_test:[1,57],data_train:[1,57],data_typ:[3,7,8,9,10,13,14,16,30,55,59,60,61],data_type_profil:48,data_type_ratio:[32,33,37,42],data_type_represent:[55,60,61],data_util:11,datafram:[1,4,7,9,11,13,14,19,20,23,24,28,40,46,51,53,57,59,61,62],datalabel:[2,19,23,31,41,53,57,60,61],datalabelercolumn:31,datalabeleropt:[31,41,44],dataprofil:[1,2,3,4,7,8,9,10,11,12,13,14,15,16,17,19,20,21,22,23,24,25,26,28,29,30,31,32,33,35,37,38,39,40,41,42,44,45,46,48,51,54,55,56,57,59,60,61,62],dataprofiler_se:56,dataset:[2,3,4,7,8,9,13,14,15,16,19,21,28,29,31,32,33,37,38,39,40,41,42,44,46,51,53,55,56,57,59,60,61,62],date:[2,3,46,55,57],datetim:[1,2,3,5,27,41,46,55,57,60],datetime_column_profil:32,datetimecolumn:32,datetimeopt:[32,41],dct:46,deal:57,debug:[17,25],decid:[41,60],decim:[40,60,61,62],deciph:[3,9],decor:[20,21,24],deem:21,deep:55,deeper:53,def:[1,3,21,57],default_factori:46,default_ind:1,default_label:[1,21,23,24,26,57],defaultdict:46,defin:[1,4,21,41,54],definit:46,delet:56,delimit:[4,9,11,53,59,61,62],delimti:3,demo:57,demonstr:[3,54,59],dens:[1,21],depart:57,depend:[21,40,56,57],deprec:21,depth:[3,46,53],deriv:1,descend:29,descent:21,describ:[12,19,20,21,22,23,24,26],descript:[3,4,59],design:[6,11,55,57,59,60,61,62],desir:[1,3,55,59,61,62],detail:[2,3,21,22,40,53,59,60,61,62],detect:[4,7,10,11,14,53,55,56,59,60,61,62],detect_cell_typ:11,detect_file_encod:11,determin:[2,3,7,8,9,11,13,14,16,19,20,21,22,23,24,26,31,41,55,60,61,62],dev:56,devid:59,df1:59,df2:59,df:[3,31,32,33,37,38,39,42,45,57,59,60],df_data:[1,57],df_result:[1,57],df_seri:[28,29,30,31,32,33,37,38,39,40,42,44,46],diamond:[3,61],dict1:46,dict2:46,dict:[2,3,4,7,8,9,11,13,14,15,16,19,20,21,22,23,24,25,26,28,29,30,31,32,33,35,37,38,39,40,41,42,44,45,46,51,55,59,60,61,62],dict_merg:46,dictionari:[3,4,11,13,19,21,22,23,25,35,40,41,46,55,60,61,62],diff:[28,29,30,31,32,33,37,38,39,40,42,44,45,60],diff_report:60,differ:[1,3,21,28,29,30,31,32,33,37,38,39,40,42,44,45,46,51,55,57,59],differenti:31,digest:2,digit:[2,22,55],dim:21,dim_emb:[1,21,26],dimens:21,dir:25,directli:[3,21,54,55,57,59,60,61,62],directori:[20,21,26,44,57,60],directpasspreprocessor:[18,24],dirpath:[2,19,20,21,23,24,26,57],disabl:[1,21,41,55,56,57,59,60],discov:56,discoveri:22,discrimin:22,discuss:3,discussion_reddit:[3,59,62],disk:[2,18,19,20,21,23,24,26,40,61,62],displai:[3,22,29,41,57],
dist:56,distribut:[21,31,55,56,59,60],dive:[53,57],doan:60,doc:3,document:[57,61,62],doe:[21,22,26,46],doesn:[21,61,62],domin:24,don:[21,55,56],done:21,doubl:56,down:[21,46],download:11,downstream:55,dp:[1,2,3,4,5,54,55,57,58,59,60,61,62],dp_log:17,drivers_licens:[2,55],drmaciv:46,drop:[1,40,57],dropout:[1,21],dtype:[1,11,21,54],dtype_polici:21,due:[46,61],dump:[55,59,60,61,62],duplic:[46,59,61,62],duplicate_row_count:[55,59,60,61],dure:[2,21],dynam:21,e:[2,3,19,21,23,46,57],each:[2,3,11,19,20,21,22,23,24,26,31,33,35,46,55,57,59,60,61,62],eager:21,earlier:57,easi:[4,55,57,60],easier:21,easili:[57,59,60],edg:55,educ:57,effici:55,egg:56,either:[11,19,21,23,46,56,61,62],element:[21,33],elif:1,els:[1,46,59,61,62],email:57,email_address:[2,55],emb:21,embed:[1,21,24],embed_fil:1,embedding_dict:1,embedding_matrix:1,empti:[1,3,46,59],empty_line_count:[55,60,62],en:46,enabl:[21,41,55,59,60],enabled_profil:41,encapsul:26,encod:[1,2,3,7,11,12,14,19,20,21,23,24,25,26,55,57,60,61,62],encoding_funct:1,end:[2,24,26,51,57],enron:57,ensur:[3,11,31,57,60],enter:21,entir:[3,4,7,9,13,14,60],entiti:[24,25,26,44,55,56,57,59,60,61,62],entity_count:[55,60,62],entity_percentag:[55,60],entity_priority_ord:24,entity_rev_dict:25,entri:[21,22,55],env:56,environ:59,epoch:[1,2,19,20,21,23,57],epoch_id:[2,21],equal:[24,59,60],equip:[55,60],equival:21,eras:[7,8,9,13,14,16],error:[1,3,17,19,21,23,31,41,46,56,57,59,61],error_on_mismatch:[19,23],estim:[33,37,38,41,42,46],etc:[2,3,11,21,24,40,46,55,59,60,61,62],evalu:[1,2,21,25],evaluate_accuraci:25,evan:57,event:21,ever:57,everi:[11,21,60,61,62],ex:60,exact:60,exactli:[21,59],examin:[7,9,13,14,16,61,62],exampl:[1,2,3,4,21,22,25,26,51,55,57,59,60,61,62],except:[1,2,3,21,24,46],exclud:22,execut:[2,21,46],exist:[7,8,9,13,14,16,19,20,21,23,24,26,41,55,57],expect:[2,11,21,57,59],explicit:60,explicitli:[59,61,62],explor:[1,55],express:21,extend:57,extens:[3,59],extern:21,extra:[1,3,9],extract:[20,21,26],extran:2,f1:[1,22,25,57],f1_report:25,f1_report_dict_to_str:25,f1_score_train:1,f1score:[1,21],f:[21,22,46],f_1:21,f_:21,f_score:21,factor:21,factori:[10,40,46],fail:3,failur:3,faliur:3,fall:21,fals:[1,3,4,11,19,20,21,22,23,24,26,33,37,38,41,42,57,59,60,61,62],fashion:51,fbeta_scor:22,fbetascor:21,fd:60,featur:[59,60],fed:40,feed:57,few:[55,57],field:3,fig:[48,54],figur:[33,54,60],file:[1,4,7,8,9,10,11,12,13,14,15,16,17,18,19,21,23,25,40,52,53,54,56,57,59,61,62],file_a:[55,60],file_b:[55,60],file_encod:[3,7,8,9,11,13,14,16],file_object:11,file_path:[7,9,11,13,14,16],file_typ:[55,59,60,61,62],filenam:[21,57,61,62],fileorbufferhandl:12,filepath:[3,5,6,11,17,40,60,61,62],filepath_or_buff:[11,12],fill:1,filter:21,final_confid:2,final_predicted_lay:1,final_result:2,find:[11,21,22,28,29,30,33,37,38,40,42,44,45,46,55,60],find_diff_of_d:46,find_diff_of_dict:46,find_diff_of_dicts_with_diff_kei:46,find_diff_of_lists_and_set:46,find_diff_of_matric:46,find_diff_of_numb:46,find_diff_of_strings_and_bool:46,find_nth_loc:11,finetun:3,first:[1,3,9,13,21,31,46,54,55,57,59,60,61],first_dict:46,fisher:46,fit:[1,2,19,20,21,31,57],fix:26,flag:[1,2,19,20,21,23,24,26,40,41],flat:[40,60,61,62],flat_dict:35,flatten:[24,35,40,60,61,62],flatten_separ:24,flatten_split:24,flattened_datafram:[4,13],float16:21,float32:21,float_column_profil:33,floatcolumn:[33,48],floatopt:[33,41],fly:21,focu:3,folder:[1,56,57],follow:[1,2,3,4,18,19,21,22,23,24,46,54,57,59,60,61,62],form:[11,56],formal:46,format:[1,2,3,4,7,9,13,14,16,19,21,22,23,24,31,32,39,40,41,57,59,60,61,62],former:59,found:[4,21,31,46,57,60],four:[40,60,61,62]
,fp:3,frac:[1,21,57],frame:[7,13,30],framework:21,free:21,frequent:29,fri:57,fridai:57,from:[1,2,3,4,7,9,11,12,13,14,15,17,18,19,20,21,22,23,24,25,26,29,30,40,41,46,53,55,59,60,61,62],from_config:21,fromkei:46,fscore:22,full:[21,55,56,59,61,62],fulli:1,func:22,further:[61,62],g:[3,21,29,46,57],g_1:46,g_2:46,game:3,gather:59,gaussian:3,gener:[1,11,21,24,25,31,32,39,40,46,55,59,60,61,62],generate_pool:46,generator_on_fil:11,get:[1,2,21,46,51,54,56,57,59,61,62],get_batch_gener:[7,8,9,13,14,16],get_child_logg:17,get_class:[20,21,24,26],get_config:21,get_delimiter_regex:11,get_figur:54,get_input_at:21,get_input_mask_at:21,get_input_shape_at:21,get_logg:17,get_losses_for:21,get_memory_s:46,get_output_at:21,get_output_mask_at:21,get_output_shape_at:21,get_paramet:[20,21,24,26],get_structured_result:[1,57],get_updates_for:21,get_weight:21,gini:29,gini_impur:[29,55,60],git:56,github:[1,3,21,36,56,57,59,61,62],give:[3,24,59],given:[1,2,3,7,9,10,11,13,14,17,19,20,21,23,24,26,30,40,46,51,55,56,57,59,60,61,62],global:[22,55,60],global_stat:[55,59,60,61,62],glove:[1,21],go:57,godbol:22,good:[3,61],govern:21,grab:60,gradient:21,gradienttap:21,graph:[2,5,21,47],greater:[1,21],green:3,ground:53,group:[11,60],guess:57,guid:22,ha:[2,3,7,9,13,14,21,22,24,26,31,38,46,57,59,60,61,62],half:59,hall:3,halv:59,handl:[2,11,21,59],happen:3,harmon:21,hasattr:21,hash:[2,55],hash_or_kei:[2,55],have:[2,3,11,13,19,20,21,23,24,26,31,46,51,55,56,57,59,60],head:[1,3,55,57,59,60],header:[4,9,11,14,53,59,61],heard:57,help:[2,19,20,21,23,24,26,57],helper:[5,27,46],henc:[16,21],here:[3,20,21,46,57,59,61],high:61,higher:[21,24],histogram:[5,27,41,48,54,55,59,60,61],histogram_and_quantil:[41,59,60],histogramopt:41,homebrew:56,honeypot:[1,57],honeypot_intentially_mislabeled_fil:[3,59],host:3,how:[2,3,9,20,21,29,51,54,57,60],howev:[2,3,59,60,61],html:3,http:[3,4,21,36,46,56],human:57,i:[2,3,19,21,23,29,57,61,62],id:57,id_count:11,idea:46,ideal:3,idempot:21,ident:2,identif:[11,26],identifi:[4,26,38,40,51,55,57,60,61,62],ideolog:22,idx:11,ie:60,iff:46,ignor:[11,21,22],ignore_dict:11,illustr:[2,3],iloc:59,imbal:22,impact:[26,30,40],implement:[21,53],impli:21,importerror:3,improp:41,improv:57,impur:29,inbound:21,inbound_nod:21,includ:[1,19,21,22,23,25,41,59,61,62],incom:21,incompat:21,incorrect:[3,29],incorrectli:[3,59],increas:61,indent:[55,59,60,61,62],index:[1,2,4,11,19,20,21,23,25,26,39,40,46,60,61],indic:[1,19,20,21,22,23,25,26,40,46],individu:[3,11,48,56,57,59],infer:[21,59,60,61],info:[7,8,9,13,14,16,17,57],inform:[2,3,21,25,57,59,61,62],ingest:[2,61,62],inherit:[1,57],initi:[1,2,11,19,20,21,23,24,26,28,29,31,32,33,37,38,39,41,42,44,45,46],inplac:[24,55,60],input:[1,2,3,7,8,9,10,11,12,13,14,15,16,19,20,21,22,23,24,25,41,46,48,57,59,61,62],input_file_path:[3,4,7,8,9,10,13,14,15,16],input_length:1,input_mask:21,input_shap:[1,21],input_signatur:21,input_spec:21,input_str:[1,26],inputspec:21,insert:[1,3,21,46,55,57,59,61,62],insid:[21,25,61,62],inspir:46,instal:[54,55],instanc:[17,19,21,22,23,29,60],instanti:[21,40,51],instead:[7,8,9,13,14,15,16,19,21,22,23,46,57],insturl:57,int_column_profil:37,intcolumn:[37,48],integ:[1,2,21,24,26,33,37,38,42,55,57,60],integr:53,intent:3,intention:3,intentionally_mislabled_fil:[3,59],interchang:57,interept:3,interest:[1,59],intern:[21,22,46],interpret:[2,3],intopt:[37,41],intro:[53,60],introductori:59,invalid:1,investig:57,involv:57,io:3,ipv4:[2,55],ipv6:[2,55],iri:[3,61],is_case_sensit:[41,60,62],is_en:[1,41,55,57,59,60,61,62],is_float:[33,37,38,42],is_in_list:51,is_in_rang:51,is_int:[33,37,38,42],
is_match:[7,8,9,13,14,16,29,53],is_numeric_stats_en:[41,60],is_prop_en:41,is_separate_at_max_len:24,is_stream_buff:11,is_structur:[7,8,9,13,14,16],is_valid_url:11,isinst:1,isn:21,issu:55,item:[1,11,46,51,57],iter:[2,11,19,20,26,46],its:[1,2,3,19,21,22,23,38,40,51,55,60],itself:[4,29,32,33,37,38,39,42],ivar:41,j:[3,29],javamail:57,john:57,join:[1,3,7,13,24,35,59,61,62],jsmith:57,json:[1,2,3,4,5,6,11,14,40,54,55,57,59,60,61,62],json_data:[7,10,13,59],json_fil:[3,59],json_lin:11,json_to_datafram:11,jsondata:[3,7,10,13,59],jupyt:59,just:[4,59,60,61,62],k:[29,46],kader:29,keep:[21,57],kei:[4,7,11,13,19,23,24,35,40,46,55,60,61,62],kera:[1,21],kernel:21,kernel_initi:21,keydict:46,keyerror:46,keyword:21,kind:21,knowledg:22,known:22,kurtosi:[33,37,38,41,42,46,55,60],kwarg:[10,20,21,24,26],l211:21,l283:21,l:21,label1:2,label2:2,label:[0,1,5,20,21,22,24,26,27,40,41,53,56,58,59,60,61,62],label_1:26,label_1_pattern_1:26,label_1_pattern_2:26,label_2:26,label_2_pattern_1:26,label_2_pattern_2:26,label_encod:44,label_map:[1,19,20,21,23,24,26,57],label_nam:[2,25],label_represent:[31,60],labeler_class:23,labeler_typ:[1,2,23,57],labeler_util:25,labler:[55,56],lack:46,lambda:[1,21],languag:21,larg:59,last:[3,11,46,54,57,59,60,61],later:[3,21,55,61,62],latter:59,law:21,layer:[1,21],layer_a:21,layer_b:21,lazi:46,learn:[53,55],least:[13,41],left:60,leftov:24,len:[1,3,21,57,59,61,62],length:[3,7,8,9,11,13,14,16,24,46],less:[11,46,60],let:[1,57,59,61,62],level:[2,5,17,18,20,24,26,35,46,53,54,55,57,59,60,61],lib:36,librari:[1,2,19,23,24,53,55,57,59],libsnappi:56,licens:21,lifo:46,like:[1,2,21,22,35,46,59,60,61,62],likelihood:29,limit:21,line:[3,9,11,13,55,57,59],linux:56,list:[1,2,3,4,7,9,11,13,14,18,19,20,21,22,23,24,25,26,33,37,38,40,41,42,46,48,51,54,55,57,59,60,61,62],list_of_necessary_param:1,littl:21,live:21,ll:57,load:[1,4,7,8,9,11,12,13,14,15,16,18,19,20,21,23,24,26,40,41,53],load_as_str_from_fil:11,load_from_disk:[2,19,20,21,23,24,26],load_from_librari:[18,19,23,24],load_opt:[19,23],load_with_compon:[19,23],loaded_profil:[60,61,62],loc:11,local:56,locat:[3,9,19,21,23,40,59,60,61],log:[5,21,57,58],logger:[17,21],logic:21,look:[53,57,61,62],loop:23,loss:[1,21],low:54,lowest:24,lstm:53,m:[2,21,29,46,56],mac_address:[2,55],machin:[59,61,62],macklemor:2,maco:56,macro:[21,22,25],made:22,mai:[3,4,7,9,13,14,16,21,41,59,61,62],mail:57,main:[0,1,2,22,54],maintain:2,major:[22,57],make:[1,2,21,55,57,59],makedir:[1,57],manag:12,mani:[3,9,20],manipul:[61,62],manner:[3,55,60],manual:[3,21],map:[1,2,20,21,24,26,40,57],margin_of_error:[55,61],mari:57,mask:21,match:[19,21,22,23,26,32,33,37,42,61,62],match_sentence_length:24,math_op:21,mathemat:3,matmul:21,matplotlib:59,matric:46,matrix1:46,matrix2:46,matrix:[1,21,22,25,46,55],max:[21,32,41,46,55,59,60,61],max_byt:11,max_char_encoding_id:[1,21],max_k_mod:41,max_length:[1,21,24,26,57],max_lin:11,max_num_char:26,max_pool_s:46,max_sample_s:[41,60,61],maximum:[11,24,60],mayb:8,mb:55,mcm:22,mcm_:22,md5:[2,55],mean:[21,22,33,37,38,42,54,55,56,57,59,60,61,62],measur:46,mechan:[2,3],meet:3,melt:[1,57],memori:[1,4,7,9,13,14,16,46],memory_s:[55,60],mention:[3,57],merg:[46,53],merge_dct:46,messag:[46,57],met:60,metadata:[4,7,13],meth:46,method:[2,3,20,21,22,24,28,38,41,46,59,60,61,62],method_timeit:46,metric:[1,21,22],metric_1:21,metric_2:21,micro:[1,21,22,25],might:57,mime:57,min:[21,32,40,41,55,59,60,61],min_sample_s:40,min_true_sampl:[40,60],mind:57,mine:22,mini:21,minimum:[31,40,60],mirror:60,mismatch:[19,23,61],miss:46,mix:21,mixed_precis:21,mixin:[5,6,33,37,41],ml:[55,56,57],mod:21
,mode:[12,21,24,33,37,38,41,42],model:[5,18,19,23,24,38,40,51,53,55,57],model_predict:2,model_result:[2,57],modeopt:41,modif:1,modifi:[21,24,59],modul:[17,21,22,58],monitor:55,monti:3,more:[3,8,21,22,24,31,53,55,57,59,61,62],most:[29,61,62],move:21,mro:[20,24],much:60,multi:[19,20,21,22,23,26],multiclass:22,multilabel:22,multilabel_confusion_matrix:22,multipl:[2,7,8,9,13,14,15,16,21,51,56,57,59,60,61,62],multiprocess:[30,40,46,60],multiproess:46,must:[1,2,4,17,19,20,21,23,24,31,60,61,62],my:[2,3,60],my_datafram:[55,59,60,61],my_label:57,my_metric_lay:21,my_modul:21,my_profil:[60,61,62],my_text:55,mydata:3,mylay:21,mymetriclay:21,mymodul:21,n:[1,2,3,11,24,26,29,57,59],n_dim:21,n_label:22,n_output:22,n_sampl:22,n_unique_label:22,na:57,name:[1,17,18,19,21,22,23,24,25,28,29,30,31,32,33,37,39,42,45,46,48,55,56,57,59,60,61],name_scop:21,nan:11,nation:[3,59],ndarrai:[19,20,21,22,24],ndim:21,nearest:24,nearli:2,necessari:1,need:[1,2,11,17,19,20,21,26,41,46,55,57,59,61,62],neg:[22,46,60],ner:[2,24,57],nest:[4,21,35,40,46,60,61,62],network:[1,21],neural:1,new_data:[55,60,61,62],new_label:[2,57],next:[1,3,57,61,62],nice:21,node:21,node_index:21,non:[1,21,24,40,57,60],non_csv_fil:3,non_trainable_vari:21,non_trainable_weight:21,none:[1,4,5,7,8,9,10,11,12,13,14,15,16,19,20,21,22,23,24,25,26,28,29,30,31,32,33,37,38,39,40,41,42,44,45,46,48,54,56,57,60,61,62],nor:21,normal:21,normal_csv_fil:60,normal_text_fil:60,note:[1,2,11,21,22,25,42,56,60,61,62],notebook:[1,3,57,59,61,62],noth:31,notic:[3,60],notset:17,nov1resourcemessagefilt:21,now:[21,57],np:[1,11,19,20,21,22,24],np_type_to_typ:[33,37,38,42],nth:11,null_count:[55,60,61],null_typ:[55,60,61],null_types_index:[55,60,61],null_valu:[40,41],num_class:[1,21],num_fil:21,num_label:[1,20,21,25,26],num_lin:3,num_neg:[41,55,60],num_quantile_group:[35,60],num_sampl:24,num_zero:[41,55,60],number:[1,2,11,19,20,21,22,23,24,40,41,46,51,52,55,57,59,60,61,62],numer:[5,21,27,33,37,41,54,60],numerical_column_stat:[33,37,38,42],numericalopt:[38,41],numericstatsmixin:[33,37,38,42],numpi:[1,11,21,24,33,36,37,38,42,46],o:57,object:[1,3,4,8,10,11,12,15,19,20,21,23,24,28,30,35,38,40,41,44,45,46,51,55,57,59,60,61,62],observ:29,obtain:[21,59],occur:[11,29,51,57,60],occurr:22,od:35,off:[41,55,59,60,61,62],offic:57,offset:12,often:[21,29],older:56,omit:25,omitted_label:25,on_read:21,onc:[1,3,61,62],one:[1,2,4,11,17,21,22,29,30,40,55,56,57,59,60,61,62],ones:[21,57,60],onli:[3,4,7,9,13,14,16,20,21,22,46,51,54,55,57,59,60,61,62],onto:46,op:21,opeid6:57,open:[3,12,55,57],open_method:12,oper:[21,55,60,61,62],opportun:24,optim:[1,21,60],option:[1,2,4,5,7,8,9,10,11,13,14,15,16,19,21,22,23,27,28,29,30,31,32,33,37,38,39,40,42,44,45,51,53,54,55,57],order:[1,2,5,20,21,22,24,26,27,29,31,41,46,55,57,61],order_column_profil:39,ordercolumn:39,orderopt:[39,41],ordin:[2,55],org:[21,46],origin:[3,11,21,24,57,59],os:[1,3,55,57,59,61,62],other:[1,21,30,40,55,57,59,60],other_profil:[28,29,31,32,33,37,38,39,40,42,44,45],otherwis:[21,22,24,31,41,46,60],ou:57,our:54,out:[1,2,46,57,60],outbound_nod:21,output:[1,2,4,11,19,21,22,23,24,25,31,32,39,40,44,45,46,57,60,61,62],output_dict:22,output_format:[2,24,40,55,57,59,60,61,62],output_mask:21,output_shap:[1,21],over:[3,19,51],overal:[59,61,62],overlap:46,overrid:[2,21,41],overview:53,overwrit:[41,57],own:[1,21,57,60],p:[29,56,60],packag:[6,52,55,56],pad:[1,24,25,57],pad_label:[1,24],page:60,pair:[19,23,46],panda:[1,4,9,24,28,29,30,31,32,33,37,38,39,40,42,45,46,51,53,54,57,59,61,62],param1:2,param2:2,param3:2,param:[1,19,20,21,23,24,25,26,33,40],param_list:[20
,21,24,26],paramet:[1,2,3,4,7,8,9,10,11,12,13,14,15,16,17,19,20,21,22,23,24,25,26,28,29,30,31,32,33,35,37,38,39,40,41,42,44,45,46,48,51,57,60],parameter:21,parq:3,parquet:[3,4,5,6,11,54,55,56,59,60,61,62],parquet_data:[10,14,55,59,60],parquet_fil:[3,59],parquetdata:[3,10,14,55,59,60],part:[21,55],partial:51,particular:[1,11,59],particularli:[61,62],partit:46,pass:[2,4,7,9,11,13,14,16,21,48],path:[1,2,3,4,7,8,9,11,12,13,14,15,16,19,20,21,23,24,26,40,44,57,59,60,61,62],pattern:26,pattern_dict:26,payload:[4,13],payload_kei:[4,13],pb:1,pd:[1,3,11,19,20,23,40,54,55,57,59,60,61,62],pdt:57,peek:[61,62],per:[2,21,22,55,57],percent:13,percentag:44,percis:41,perform:[1,21,22,57],permiss:21,perri:29,person:[2,55,57],pertain:[4,7,8,9,13,14,15,16],phone:57,phone_numb:[2,55],pickl:[61,62],pii:55,pip3:56,pip:[54,55,56],pipelin:[2,19,23,53,55,57],pkl:[60,61,62],place:[21,24,40,60,61,62],plain:[3,57,59],pleas:[2,55],plethora:59,plot:48,plot_col_histogram:[48,54],plot_histogram:[48,54],plt:59,plug:1,pm:57,point:[4,22,55,61,62],polici:21,pool:[30,40,46],pop:46,popitem:46,pos_label:22,posit:[2,21,22,57],possibl:[2,4,7,9,13,14,16,21,26,55,60,61,62],post:3,postprocess:24,postprocess_char_level:[55,60,62],postprocessor:[1,18,19,23,24,57],potenti:[21,46],pp:[22,29],pre:[1,55],precis:[21,22,25,33,41,46,55,57,60,61],precision_recall_fscore_support:22,precisionopt:41,pred:[2,22,24,57],predict:[1,2,19,20,21,23,24,25,26,31,53,55,59,60,61,62],predict_opt:[2,19,23,57],predicted_entities_in_index:25,prefer:60,preivous:3,premium:3,prep:57,prepar:57,preprocess:24,preprocessor:[1,18,19,23,24,57],present:[1,22,46],pretti:[11,40,55,59,60,61,62],prettifi:[40,55,60,61,62],previou:60,previous:3,price:3,prim:28,primari:[61,62],primit:30,princip:21,print:[1,2,3,11,19,20,21,23,25,26,55,57,59,60,61,62],printout:25,prior:[11,61,62],prioriti:24,priority_ord:24,priority_predict:24,privat:28,privileg:57,probabl:[29,31,40,60,61],process:[1,2,5,11,18,20,46,51,59,60,61,62],processor:[2,18,19,23,24,57],processor_param:1,processor_typ:24,produc:21,profil:[0,1,2,3,5,6,35,38,46,48,53,54,56,57,58],profile1:[55,59,60,61,62],profile2:[55,59,60,61,62],profile3:[55,60,61,62],profile_build:40,profile_ful:59,profile_merg:59,profile_opt:[1,57,59,60,61,62],profile_schema:[55,60],profiler_opt:41,profiler_typ:[40,55,59,60,61,62],profileropt:[1,40,41,57,59,60,61,62],prop:41,propag:21,proper:10,properli:[11,41],properti:[3,4,7,8,9,13,14,16,19,20,21,23,26,28,29,30,31,32,33,37,38,39,40,41,42,44,45,59,60,61,62],proto:3,provid:[1,3,21,23,40,46,53,54,55,57,59,60],pst:57,purpos:[61,62],put:24,py3:56,py:[21,36,56],pypi:[55,56],pyplot:59,pytest:56,python3:56,python:[21,33,37,38,42,55,56],quadratur:3,quantil:[35,55,60,61],quantiti:[2,55],queri:11,quick:[57,61],quickli:[61,62],quot:[3,9],quotechar:[9,11,53],r:[12,26,56],rag:24,rais:[1,3,21,22,31,41,46,59],raise_error:41,random:[2,3,21,24,29,40,55,57,60],random_st:[24,57],randomli:24,rang:[1,21,51,61,62],rank:[21,31],rare:[55,60],rather:21,ratio:[1,29,32,33,37,41,42,59,60,61],raw:57,re:[40,57],reach:[3,60],read:[1,4,7,9,11,12,13,14,16,22,53,55,59,60,61,62],read_csv_df:11,read_in_str:11,read_json:11,read_json_df:11,read_parquet_df:11,read_text_as_list_of_str:11,readabl:[12,22,46,57],readable_report:[55,60],reader:[0,1,5,53,55,57,58,60],readlin:3,real:57,reason:[21,61,62],rec_dropout:1,recal:[21,22,25,57],receiv:[21,22],recipi:57,recognit:[2,55,56,57,60],reconstruct:[1,2,57],record:[3,4,7,9,13,14,21,46,62],record_samples_per_lin:[3,9],recurr:1,recurrent_activ:1,recurrent_dropout:1,recurs:[21,46],red:3,reduc:[1,46],reduc
e_max:21,reduce_mean:21,reduce_min:21,reduce_sum:21,redund:41,refer:[2,21,22,29],referenc:22,refit:19,reflect:21,regard:57,regardless:59,regex:[5,11,18,24,40],regex_model:[18,26],regex_pattern:26,regexmodel:[18,26],regexpostprocessor:[18,24],regist:[20,24],regular:21,reinstanti:51,reject:46,rel:11,relat:[36,59,61],relev:21,reload:[7,8,9,13,14,16,53],relu:1,remain:24,remov:[21,38,40,46,56,59,60,61,62],replac:[1,7,8,9,13,14,16,26,60],repo:56,report:[1,5,18,25,27,30,33,34,40,48,53,54,55,57,58,59],report_ful:59,report_help:35,report_merg:59,report_opt:[40,55,59,60,61,62],repres:[11,21,25,29,32,33,37,38,39,40,42],represent:[11,31],reproduc:60,request:3,requir:[1,2,3,21,24,26,40,54,55,56,57,60,61,62],requires_zero_map:[1,20,21,26],rerun:56,reserv:[1,21],reset:[1,19,20,21,26,55],reset_index:[1,57],reset_st:21,reset_weight:[19,20,21,26,57],resolut:[20,24],resolv:56,resourc:[21,46],respect:[38,40,51,57],respons:[4,60],rest:[3,21],result:[1,2,21,22,24,30,40,51,55,57,59,60,61],retrain:[2,57],retriev:[19,20,21,23,24,26,60],return_sequ:1,reus:[2,21],revers:[20,21,26],reverse_label_map:[19,20,21,23,26],review:[2,61],rice:[59,60],right:21,round:[22,40,60,61,62],row:[2,3,4,9,11,55,57,59,60,61],row_count:[55,59,60,61],row_has_null_ratio:[55,60,61],row_is_null_ratio:[55,59,60,61],rtype:[11,20,24,25,31,32,33,37,38,39,40,41,42,46],run:[2,20,21,41,51,56,57,59,61,62],runtim:[40,60,61],runtimeerror:21,s:[1,2,4,11,20,21,24,33,35,37,38,41,42,46,57,61,62],safe:21,sai:[31,55,56],said:[61,62],sake:[1,57],sale:57,same:[1,2,11,19,20,21,22,23,24,26,31,41,46,55,59,60,61,62],same_a:[2,19,20,21,23,26],sampl:[1,2,19,20,21,22,24,31,32,33,37,40,41,42,46,55,57,59,61,62],sample1:59,sample2:59,sample3:59,sample_arrai:60,sample_id:40,sample_in_chunk:46,sample_ratio:[41,60],sample_s:[29,40,55,59,60,61],sample_skew:46,sample_weight:[21,22],samples_per_lin:[4,16],samples_per_upd:[40,60],samples_us:[55,60,61,62],sampling_ratio:40,sarawagi:22,satisfi:60,save:[2,7,8,9,13,14,16,19,20,21,23,24,26,40,53],save_dirpath:[1,2,23,57],save_to_disk:[1,2,19,20,21,23,24,26,57],saw:57,scalar:21,scan:57,schema:[55,60,61,62],schooldatasmal:[57,62],scope:21,score:[1,21,22,25,57],scott:60,scratch:53,sdist:56,seaborn:[48,54],search:11,search_queri:11,search_str:57,second:[1,31,46,55,59],second_dict:46,section:[1,55,57,59,60,61,62],see:[21,22,41,51,55,56,59,60,61,62],seed:5,seek_offset:12,seek_whenc:12,seem:41,seen:31,select:[4,7,9,13,14,16,24,30,53,60,62],selected_column:[3,4,9,11,14],selected_data_typ:30,selected_kei:[4,7,13],self:[1,21,30,41],send:[19,41,57,60],sensit:[22,41,55,56,59,60],sent:[1,19,57,60],sentenc:[3,24,25,62],separ:[11,24,35,59],seper:[35,61,62],sequenc:21,sequenti:[1,59],seri:[11,19,20,29,30,31,32,33,37,38,39,40,42,45,46,54,55,61,62],serial:21,serializ:[2,21,40,60,61,62],serv:1,set:[1,2,3,4,5,7,11,14,17,19,20,21,22,23,24,26,41,46,51,55,57,58,59,61,62],set_label:[19,23],set_label_map:[20,21,26],set_model:[1,2,19,23,57],set_param:[1,2,19,20,21,23,24,26,57],set_postprocessor:[2,19,23,57],set_preprocessor:[2,19,23,57],set_se:[5,60],set_verbos:[17,57],set_weight:21,setdefault:[1,46],setter:41,setup:56,sever:[37,38,59,60],sha1:[2,55],sha256:[2,55],shallow:46,shantanu:22,shape:[1,21,22,24],share:[46,60],sheet:3,shorten:[40,59,60,61,62],should:[3,21,26,51,54,55,59,60,61,62],show:[1,22,54,57,59,60],show_confid:[2,19,20,21,23,26,57],shown:[3,21,22,59,60],shuffl:46,shuffle_in_chunk:46,si1:3,si2:3,sigmoid:1,sign:46,signatur:21,signific:[33,60],simialrli:61,similar:[2,40,60,61,62],similarli:[21,61,62],simpl:[57,61,62],simpli:[21,31],simultan:2,sin
c:[2,7,14,21,41,57],singl:[3,9,19,20,21,23,24,26,55,62],singlequot:3,size:[1,11,19,23,24,40,41,46,59],size_conv:[1,21],size_fc:[1,21],size_lstm:1,skew:[33,37,38,41,42,46,55,60],skip:[3,19,23,59,61],skip_postprocessor:[19,23],sklearn:[22,25],slightli:21,slimmer:[55,56],small:57,smith:57,snappy_compressed_intentionally_mislabeled_fil:3,so:[21,41,42,54],softmax:1,softmax_output_layer_nam:1,softwar:21,some:[1,21,54,57,59,60],someth:[3,57],sort:[21,22,55,60],sort_valu:[55,60],sourc:[11,21,56,57],source_fil:21,space:1,spaci:[2,57],spam:57,spars:[3,59,61],special:15,specif:[2,3,4,8,11,17,21,22,54,55,57,59,60,61,62],specifi:[1,2,4,7,8,9,10,13,14,15,16,19,21,22,23,24,35,40,41,46,48,51,53,54,59,61],split:[0,1,19,24,55,57,60],split_predict:24,split_ratio:[1,57],spreadsheet:[7,9,13,14,15],spreadsheetdata:[9,13,14],spreadsheetdatamixin:[9,13,14,15],sqrt:60,squar:22,src:3,ssmith:57,ssn:[2,4,55],stabil:21,standard:57,start:[2,12,24,26,51,57],stat1:46,stat2:46,stat:[5,27,28,33,37,41,46,55,60],state:[21,24,41],staticmethod:38,statist:[3,21,29,33,40,46,55,59,61,62],statu:[19,20,21,23,26],std:[55,61],stddev:[33,37,38,42,55,60,61],step:[1,21,56],still:[21,55],stop:[41,60,62],stop_word:[41,60,62],store:[20,40,46,61,62],str:[1,3,7,8,9,11,12,13,14,15,16,17,19,20,21,22,23,24,25,26,30,33,35,37,38,40,41,42,46,48,54,55,57,60,61,62],stream:[4,11,12,59],strength:22,strict:[55,56],string:[1,2,3,4,11,12,15,21,22,24,25,26,28,29,31,32,33,37,38,39,40,41,42,44,45,46,51,55,57,59,60,61,62],stringio:12,structcharpostprocessor:[2,18,24,57],structcharpreprocessor:[2,18,24,57],structur:[1,3,4,5,6,11,12,21,22,23,24,31,40,41,53,55],structured_mixin:[9,13,14,15],structured_model:18,structured_opt:[1,41,57,59,60,61],structured_profil:60,structured_report:60,structuredcol:40,structuredcolprofil:40,structureddatalabel:[18,23],structuredopt:[40,41],structuredprofil:[7,8,9,13,14,16,40,48],sturg:60,sub:21,subclass:[8,12,20,21,24,28,29,32,38,39,42],subject:[3,57],submodul:21,subsequ:[24,60],subset:[22,41,57],subtract:60,success:35,successfulli:59,sudo:56,suggeset:46,suggest:[46,55],suggest_pool_s:46,suggested_pool_s:46,suit:57,sum:[21,41,55,60],summari:[1,2,21,22,57,61,62],sunita:22,supplement:3,support:[4,21,22,25,57,61,62],supports_mask:21,sure:1,swap:[1,2],sy:[1,3,57,59,61,62],symbol:21,synchron:21,synthet:[61,62],system:60,t334:2,t555:2,t:[21,24,55,56,60,61,62],tabl:3,tabular:[1,2,57],take:[2,11,22,24,48,54,55,57,60,61,62],taken:25,tanh:1,target:22,target_nam:22,tcall:2,teach:29,techniqu:3,tensor:21,tensorflow:[1,21,46,55,56,57],tensorflow_addon:21,tensorshap:21,tensorspec:21,term:[1,2,61,62],terminolog:55,test:[1,3,7,9,11,13,14,33,37,38,42,57,59,60,61,62],test_csv_data:56,test_label:57,test_profile_build:56,testprofil:56,text:[1,2,3,4,5,6,9,18,21,22,24,25,27,38,41,55,57,59,60,61,62],text_column_profil:42,text_data:[10,16,55,59],text_fil:[3,55,59],text_sampl:59,textcolumn:42,textdata:[3,10,16,55,59],textiobas:12,textopt:[41,42],textprofil:45,textprofileropt:41,textrm:21,textual:21,tf1:21,tf2:21,tf:[1,21,57],tfood:2,than:[1,11,21,24,31,60],thei:[3,20,21,24,26,57,59,61,62],them:[40,54,55,57,59,60,61,62],themselv:21,theori:3,therefor:11,thereof:21,thi:[1,2,3,6,7,8,9,11,13,14,16,21,22,24,25,26,30,31,32,33,37,38,40,41,42,46,51,54,55,56,57,59,60,61,62],thing:21,think:57,those:[48,57],three:[57,60],threshold:[1,21,24],through:[4,11,21,29,57,59],thrown:1,thu:21,thyme:57,ti:2,tie:24,time:[2,11,21,22,46,51,55,61,62],timedelta:46,titan:3,titl:[48,54],tjohn:2,tneed:2,tnot:2,togeth:[19,21,23,24,46,55,60,61,62],toggl:[55,59,60],token:16,tolist:[1,57]
,too:[46,55,56],top:[29,31,41,46,60],top_k:31,top_k_categori:41,top_k_char:[41,60],top_k_mod:41,top_k_word:[41,60],topolog:21,total:[21,22],tpleas:2,trace:21,track:21,train:[1,19,20,21,23,53,55],train_data:[20,21],train_structured_label:[2,23,57],trainabl:[1,2,19,21,23,57],trainable_vari:21,trainable_weight:21,trainabledatalabel:[2,19],transfer:53,transform:21,travers:29,true_char_level:[55,60,62],true_entities_in_index:25,tssn:2,tsv:[3,55,61,62],tupl:[1,11,21,22,24,46],turn:[21,41,57,59,60,61,62],tutori:3,two:[2,3,21,24,31,32,39,40,44,45,46,55,59,60,61,62],txt:[1,3,55,56,59,60,61,62],typcial:46,type:[1,2,4,7,8,9,10,11,13,14,15,16,20,21,22,24,25,26,28,29,30,31,32,33,35,37,38,39,40,41,42,44,45,46,48,51,53,55,57,59,60],typeerror:21,typic:[4,21,57],typl:11,u:29,unalik:[29,55,60],unchang:[46,60],unclean:38,undefin:22,undefinedmetricwarn:22,under:21,underli:55,unicod:11,unicode_to_str:11,unik:29,union:[11,12,13,19,20,21,22,23,24,26,40,41,46,48],uniqu:[1,3,21,29,46,57,60,61,62],unique_count:[55,60,61],unique_ratio:[29,55,60,61],unique_row_ratio:[55,60,61],unit:[1,46,56],unittest:56,unknown:[1,2,24,25,55,57],unless:21,unlik:[21,29],unnest:35,unspecifi:60,unstructur:[3,5,23,24,27,40,41,53,61],unstructured_labeler_profil:44,unstructured_model:18,unstructured_opt:[41,60,62],unstructured_profil:60,unstructured_report:60,unstructured_text_profil:45,unstructuredcompil:30,unstructureddatalabel:[18,23],unstructuredlabelerprofil:44,unstructuredopt:41,unstructuredprofil:40,unstructuredtextopt:45,unstuctur:40,until:[24,60],unus:21,unweight:22,up:[11,53],updat:[1,2,21,22,28,29,30,31,32,33,37,38,39,40,41,42,44,45,46,53,54],update_column_profil:40,update_profil:[30,40,55,59,60,61,62],update_st:21,upon:[19,23,24,26],url:[2,11,55,57],url_as_str:11,url_to_byt:11,us:[1,2,3,7,9,11,12,13,14,16,18,19,20,21,22,23,24,26,31,40,41,54,55,56,57,59,60,61,62],us_stat:[2,55],usag:[20,24,53],use_word_level_argmax:[2,24,57],user:[1,2,3,4,7,9,13,14,16,19,20,21,22,23,26,40,55,57,59,60,61,62],userdata1:[3,59],userdata1_intentionally_mislabled_fil:[3,59],utf:[3,11,28,38,40,41,51,61],util:[2,3,5,6,18,27,30,40,55,57,60],uuid:[2,55],v0:21,v1:[36,57],v:[46,56],val:[33,37,38,42],val_data:[20,21],valid:[0,1,2,4,5,7,9,11,13,14,19,20,21,23,41,55,58,59,60],validation_split:[2,19,57],valu:[1,3,9,11,12,19,20,21,22,23,24,26,31,33,37,38,40,41,42,46,55,57,60,61,62],value1:2,value2:2,value3:2,value_label_df:1,valueerror:[1,21],vari:60,variabl:[21,29,33,37,38,40,41,42,48,54],variable_dtyp:21,variableaggreg:21,variablesynchron:21,varianc:[21,33,37,38,41,42,55,59,60,61],variat:29,variou:4,vartyp:41,vast:60,vector:21,venv3:56,verbos:[17,19,20,21,23,25,26],verifi:59,verify_ssl:4,version:[5,21,56,57,58,61,62],versu:22,via:[3,4,7,9,11,13,14,16,20,21,26,55,56,60,61,62],view:[1,3,46,57,59,61,62],virtual:[20,24,56],virtualenv:56,visit:55,vocab:[41,55,60,61,62],vocab_count:[55,60],vocabulari:1,vol:29,vote:[24,31],vs1:3,vs2:3,vs:[24,53],w:[2,21,26,57],wa:[3,21,24,57],wai:[21,29,57,60],walk:59,want:[3,11,20,21,26,41,54,55,56,59],warn:[17,19,21,22,23,41,46],warn_for:22,warn_on_profil:46,warranti:21,we:[1,3,11,21,29,41,48,57,59,60,61,62],weight:[1,2,19,20,21,22,25,26],welch:60,well:[2,11,21,40,59,60,61,62],went:3,were:[3,24,60],what:[2,3,11,46,53,61,62],when:[1,2,3,20,21,22,24,26,31,41,55,60,62],where:[1,2,11,19,20,21,23,24,26,29,40,60],wherea:21,whether:[3,11,19,20,21,23,24,26,41,59,60],which:[1,2,3,4,7,8,9,11,13,14,16,21,22,24,29,31,32,33,37,38,39,41,42,46,55,57,59,60,61,62],whl:56,whole:[20,21,26],whose:21,wide:[61,62],wiki:46,wikipedia:[22,46],wise:51,wish:2
1,with_name_scop:21,within:[3,11,19,23,24,26,31,41,60],without:[19,21,23,51],word:[24,41,55,57,60,62],word_count:[55,60,62],word_level:[24,55,60,62],word_level_min_perc:24,work:[1,21,41,54,55,59,60,62],worri:21,would:[1,2,21,57,60],wrap:[6,21],write:21,wrong:3,www:[21,46],x01:24,x1:46,x1b:3,x2:[46,56],x:[1,2,3,11,19,21,33,37,38,42,51,56,57],y1:46,y2:[46,56],y:[1,2,3,19,56,57],y_:22,y_pred:[21,22],y_true:[21,22],yate:46,yet:21,you:[2,3,11,20,21,26,41,51,54,55,56,57,59,60,61,62],you_websit:4,your:[3,21,57],your_data:2,your_fil:[2,4,55,60],z2:56,z:[3,56],zero:[1,21,60],zoo:[19,23]},titles:["API","Adding new model to the existing DataLabeler pipeline","Labeler (Sensitive Data)","Intro to Data Readers","Data Readers","Dataprofiler","Data Readers","Avro Data","Base Data","CSV Data","Data","Data Utils","Filepath Or Buffer","JSON Data","Parquet Data","Structured Mixins","Text Data","Dp Logging","Labelers","Base Data Labeler","Base Model","Character Level Cnn Model","Classification Report Utils","Data Labelers","Data Processing","Labeler Utils","Regex Model","Profilers","Base Column Profilers","Categorical Column Profile","Column Profile Compilers","Data Labeler Column Profile","Datetime Column Profile","Float Column Profile","Helpers","Report Helpers","Histogram Utils","Int Column Profile","Numerical Column Stats","Order Column Profile","Profile Builder","Profiler Options","Text Column Profile","Unstructured Data Labeler Column Profile","Unstructured Labeler Profile","Unstructured Text Profile","Utils","Reports","Graphs","Settings","Validators","Base Validators","Version","Examples","Graphs","Data Profiler | What\u2019s in your data?","Install","Sensitive Data Detection with the Labeler","dataprofiler","Data Profiler - What\u2019s in your data?","Profiler","Structured Profilers","Unstructured 
Profilers"],titleterms:{"class":[54,59],"float":33,"import":54,"int":37,"new":[1,2],A:3,Or:12,access:3,ad:1,after:3,alter:3,an:[2,54],api:0,attribut:3,automat:3,avro:7,avrodata:4,base:[8,19,20,28,51],basic:[53,59],buffer:12,build:[2,56,57],builder:40,categor:29,charact:[1,21],check:3,classif:22,cnn:21,column:[3,28,29,30,31,32,33,37,38,39,42,43],compil:30,compon:2,conclus:59,content:0,csv:9,csvdata:[3,4],data:[2,3,4,6,7,8,9,10,11,13,14,16,19,23,24,31,43,54,55,57,59,60],data_format:3,datafram:[3,55,60],datalabel:1,dataprofil:[5,58],dataset:1,datetim:32,deeper:3,delimit:[3,55,60],depend:60,detect:[3,57],differ:60,dive:3,dp:17,entiti:2,exampl:[53,54],exist:[1,2],extend:2,file:[3,55,60],filepath:12,filetyp:[55,60],floatcolumn:54,format:55,from:[54,56,57],get:55,graph:[48,54],ground:57,header:3,helper:[34,35],histogram:36,identifi:2,implement:1,individu:54,instal:56,intcolumn:54,integr:1,intro:3,is_match:3,json:13,jsondata:4,label:[2,18,19,23,25,31,43,44,55,57],learn:[2,57],level:[1,21],load:[2,3,55,57,60,61,62],log:17,lstm:1,merg:[55,59,60,61,62],mixin:15,model:[1,2,20,21,26],modul:[5,6,18,27,34,47,50],need:54,numer:38,option:[3,41,59,60,61,62],order:[39,60],own:2,panda:[3,55,60],parquet:14,parquetdata:4,pipelin:1,plot:54,postprocessor:2,predict:57,preprocessor:2,process:24,profil:[27,28,29,30,31,32,33,37,39,40,41,42,43,44,45,55,59,60,61,62],purpos:55,quotechar:3,read:3,reader:[3,4,6,59],regex:26,reload:3,report:[22,35,47,60,61,62],s:[55,59],sampl:60,save:[57,60,61,62],scratch:[56,57],seed:60,select:3,sensit:[2,57],set:[49,60],size:60,snappi:56,specifi:[3,55,60],start:55,stat:38,statist:60,structur:[2,15,57,59,60,61],structuredprofil:54,support:55,test:56,text:[16,42,45],textdata:4,train:[2,57],transfer:[2,57],type:[3,61,62],unstructur:[2,43,44,45,55,57,59,60,62],up:57,updat:[55,59,60,61,62],url:4,us:4,usag:59,util:[11,22,25,36,46],valid:[50,51],version:[52,55],vs:[59,60],we:54,what:[54,55,57,59],your:[2,54,55,59,60]}}) \ No newline at end of file diff --git a/docs/0.7.1/html/unstructured_profiler_example.html b/docs/0.7.1/html/unstructured_profiler_example.html new file mode 100644 index 000000000..1a812daab --- /dev/null +++ b/docs/0.7.1/html/unstructured_profiler_example.html @@ -0,0 +1,791 @@ + + + + + + + + + Unstructured Profilers - Data Profiler v0.7.1 + + + + + + + + + + + + + Contents + + + + + + + + + Menu + + + + + + + + Expand + + + + + + + + + + + + + +
+
+
+ +
+ +
+ +
+
+ +
+
+
+ + + +

View this notebook on GitHub

+
+

Unstructured Profilers

+

Data profiling is the process of examining a dataset and collecting statistical or informational summaries about that dataset.

+

The Profiler class inside the DataProfiler is designed to generate data profiles by ingesting either a Data class or a Pandas DataFrame.

+

Currently, the Data class supports loading the following file formats:

+
    +
  • Any delimited (CSV, TSV, etc.)

  • +
  • JSON object

  • +
  • Avro

  • +
  • Parquet

  • +
  • Text files

  • +
  • Pandas Series/Dataframe

  • +
+

Once the data is loaded, the Profiler can calculate statistics and predict the entities (via the Labeler) of every column (CSV) or key-value pair (JSON), as well as dataset-wide information such as the number of nulls, duplicates, etc.

+

This example looks specifically at the unstructured data types used for unstructured profiling. This means that only text files, lists of strings, single-column pandas dataframes/series, or DataProfiler Data objects in string format will work with the unstructured profiler.

+
+

Reporting

+

One of the primary purposes of the Profiler is to quickly identify what is in the dataset. This can be useful for analyzing a dataset prior to use or for determining which columns could be useful for a given purpose.

+

In terms of reporting, there are multiple reporting options:

+
    +
  • Pretty: Floats are rounded to four decimal places, and lists are shortened.

  • +
  • Compact: Similar to pretty, but removes detailed statistics

  • +
  • Serializable: Output is JSON serializable and not prettified.

  • +
  • Flat: Nested output is returned as a flattened dictionary.

  • +
+

The Pretty and Compact reports are the two most commonly used reports and include global_stats and data_stats for the given dataset. global_stats contains overall properties of the data, such as samples used and file encoding. data_stats contains specific properties and statistics for each text sample.

+

For unstructured profiles, the report looks like this:

+
"global_stats": {
+    "samples_used": int,
+    "empty_line_count": int,
+    "file_type": string,
+    "encoding": string
+},
+"data_stats": {
+    "data_label": {
+        "entity_counts": {
+            "word_level": dict(int),
+            "true_char_level": dict(int),
+            "postprocess_char_level": dict(int)
+        },
+        "times": dict(float)
+    },
+    "statistics": {
+        "vocab": list(char),
+        "words": list(string),
+        "word_count": dict(int),
+        "times": dict(float)
+    }
+}
+
+
+
+
[ ]:
+
+
+
+import os
+import sys
+import json
+sys.path.insert(0, '..')
+import dataprofiler as dp
+
+data_path = "../dataprofiler/tests/data"
+
+
+
+
+
[ ]:
+
+
+
+data = dp.Data(os.path.join(data_path, "txt/discussion_reddit.txt"))
+profile = dp.Profiler(data)
+
+report  = profile.report(report_options={"output_format": "pretty"})
+print(json.dumps(report, indent=4))
+
+
+
+
+
+
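Since the report is a plain nested dictionary, individual fields from the schema above can be pulled out directly rather than printing the whole thing. A minimal sketch, reusing the report generated in the previous cell (the exact keys present can vary with the enabled options):

+# drill into a few specific fields of the report generated above
+print(report["global_stats"]["samples_used"])
+print(report["global_stats"]["encoding"])
+print(report["data_stats"]["statistics"]["word_count"])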

Profiler Type

+

Note that, in addition to reading the input data from text files, DataProfiler accepts the input data as a pandas dataframe, a pandas series, a list, or a Data object (when an unstructured format is selected), provided the Profiler is explicitly set to unstructured.

+
+
[ ]:
+
+
+
+# run data profiler and get the report
+import pandas as pd
+data = dp.Data(os.path.join(data_path, "csv/SchoolDataSmall.csv"), options={"data_format": "records"})
+profile = dp.Profiler(data, profiler_type='unstructured')
+
+report  = profile.report(report_options={"output_format":"pretty"})
+print(json.dumps(report, indent=4))
+
+
+
+
+
+
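To illustrate the other accepted input types, a pandas Series (or plain list) of raw text can also be handed straight to the Profiler when profiler_type='unstructured' is specified. A minimal sketch; the sample sentences below are made up purely for illustration:

+# profile a pandas Series of raw text directly (sample text is illustrative only)
+text_samples = pd.Series(["This is a sample sentence.", "Another line of free-form text."])
+profile = dp.Profiler(text_samples, profiler_type='unstructured')
+
+report = profile.report(report_options={"output_format": "compact"})
+print(json.dumps(report, indent=4))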

Profiler options

+

The DataProfiler has the ability to turn on and off components as needed. This is accomplished via the ProfilerOptions class.

+

For example, if a user doesn’t require vocab count information, they may want to turn off the vocab count functionality.

+

Below, let’s remove the vocab count and set the stop words.

+

A full list of options is available in the Profiler section of the DataProfiler documentation.

+
+
[ ]:
+
+
+
+data = dp.Data(os.path.join(data_path, "txt/discussion_reddit.txt"))
+
+profile_options = dp.ProfilerOptions()
+
+# Setting multiple options via set
+profile_options.set({ "*.vocab.is_enabled": False, "*.is_case_sensitive": True })
+
+# Set options via directly setting them
+profile_options.unstructured_options.text.stop_words = ["These", "are", "stop", "words"]
+
+profile = dp.Profiler(data, options=profile_options)
+report  = profile.report(report_options={"output_format": "compact"})
+
+# Print the report
+print(json.dumps(report, indent=4))
+
+
+
+
+
+
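Entity labeling is typically the most computationally expensive part of unstructured profiling, so another common use of ProfilerOptions is to switch the data labeler off when only the text statistics are needed. A minimal sketch, assuming the labeler option follows the same *.<component>.is_enabled pattern used in the cell above:

+# turn off the (relatively slow) entity labeling and keep only the text statistics
+fast_options = dp.ProfilerOptions()
+fast_options.set({"*.data_labeler.is_enabled": False})
+
+fast_profile = dp.Profiler(data, options=fast_options)
+report = fast_profile.report(report_options={"output_format": "compact"})
+print(json.dumps(report, indent=4))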

Updating Profiles

+

Beyond just profiling, one of the unique aspects of the DataProfiler is the ability to update the profiles. For an update to work correctly, the schema (columns / keys) must match.

+
+
[ ]:
+
+
+
+# Load and profile a text file
+data = dp.Data(os.path.join(data_path, "txt/sentence-3x.txt"))
+profile = dp.Profiler(data)
+
+# Update the profile with new data:
+new_data = dp.Data(os.path.join(data_path, "txt/sentence-3x.txt"))
+profile.update_profile(new_data)
+
+# Take a peek at the data
+print(data.data)
+print(new_data.data)
+
+# Report the compact version of the profile
+report  = profile.report(report_options={"output_format": "compact"})
+print(json.dumps(report, indent=4))
+
+
+
+
+
+
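When data arrives in batches, update_profile can simply be called once per batch to grow a single profile incrementally. A minimal sketch, assuming the same two sample text files used elsewhere in this notebook:

+# build one profile incrementally, one batch (file) at a time
+profile = None
+for filename in ["txt/sentence-3x.txt", "txt/sentence.txt"]:
+    batch = dp.Data(os.path.join(data_path, filename))
+    if profile is None:
+        profile = dp.Profiler(batch)
+    else:
+        profile.update_profile(batch)
+
+report = profile.report(report_options={"output_format": "compact"})
+print(json.dumps(report, indent=4))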

Merging Profiles

+

Merging profiles is an alternative method for updating profiles. In particular, multiple profiles can be generated separately, then added together with a simple + command: profile3 = profile1 + profile2

+
+
[ ]:
+
+
+
+# Load a text file
+data1 = dp.Data(os.path.join(data_path, "txt/sentence-3x.txt"))
+profile1 = dp.Profiler(data1)
+
+# Load another text file
+data2 = dp.Data(os.path.join(data_path, "txt/sentence-3x.txt"))
+profile2 = dp.Profiler(data2)
+
+# Merge the profiles
+profile3 = profile1 + profile2
+
+# Report the compact version of the profile
+report  = profile3.report(report_options={"output_format":"compact"})
+print(json.dumps(report, indent=4))
+
+
+
+

As you can see, the update_profile function and the + operator function similarly. The reason the + operator is important is that it’s possible to save and load profiles, which we cover next.

+
+
+

Saving and Loading a Profile

+

Not only can the Profiler create and update profiles, it’s also possible to save, load, and then manipulate profiles.

+
+
[ ]:
+
+
+
+# Load data
+data = dp.Data(os.path.join(data_path, "txt/sentence-3x.txt"))
+
+# Generate a profile
+profile = dp.Profiler(data)
+
+# Save a profile to disk for later (saves as pickle file)
+profile.save(filepath="my_profile.pkl")
+
+# Load a profile from disk
+loaded_profile = dp.Profiler.load("my_profile.pkl")
+
+# Report the compact version of the profile
+report = profile.report(report_options={"output_format":"compact"})
+print(json.dumps(report, indent=4))
+
+
+
+
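The reloaded profile behaves just like the original, so it can be reported on, updated, or merged in the same way. A small follow-up to the cell above:

+# the loaded profile can be used exactly like the original
+loaded_report = loaded_profile.report(report_options={"output_format": "compact"})
+print(json.dumps(loaded_report, indent=4))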

With the ability to save and load profiles, profiles can be generated on multiple machines and then merged. Further, profiles can be stored and later used in applications such as change point detection, synthetic data generation, and more.

+
+
[ ]:
+
+
+
+# Load multiple files via the Data class
+filenames = ["txt/sentence-3x.txt",
+             "txt/sentence.txt"]
+data_objects = []
+for filename in filenames:
+    data_objects.append(dp.Data(os.path.join(data_path, filename)))
+
+print(data_objects)
+# Generate and save profiles
+for i in range(len(data_objects)):
+    profile = dp.Profiler(data_objects[i])
+    report = profile.report(report_options={"output_format":"compact"})
+    print(json.dumps(report, indent=4))
+    profile.save(filepath="data-"+str(i)+".pkl")
+
+
+# Load profiles and add them together
+profile = None
+for i in range(len(data_objects)):
+    if profile is None:
+        profile = dp.Profiler.load("data-"+str(i)+".pkl")
+    else:
+        profile += dp.Profiler.load("data-"+str(i)+".pkl")
+
+
+# Report the compact version of the profile
+report = profile.report(report_options={"output_format":"compact"})
+print(json.dumps(report, indent=4))
+
+
+
+
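Since the loop above writes data-0.pkl, data-1.pkl, ... into the working directory, an optional cleanup step can remove them once the merged profile has been reported:

+# optional: remove the temporary profile files written above
+for i in range(len(data_objects)):
+    os.remove("data-" + str(i) + ".pkl")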
+
+ +
+ +
+ +
+
+ + + + + + + + + \ No newline at end of file diff --git a/docs/0.7.1/html/unstructured_profiler_example.ipynb b/docs/0.7.1/html/unstructured_profiler_example.ipynb new file mode 100644 index 000000000..e738cf3a7 --- /dev/null +++ b/docs/0.7.1/html/unstructured_profiler_example.ipynb @@ -0,0 +1,388 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "f37ca393", + "metadata": {}, + "source": [ + "# Unstructured Profilers" + ] + }, + { + "cell_type": "markdown", + "id": "ff9bd095", + "metadata": {}, + "source": [ + "**Data profiling** - *is the process of examining a dataset and collecting statistical or informational summaries about said dataset.*\n", + "\n", + "The Profiler class inside the DataProfiler is designed to generate *data profiles* via the Profiler class, which ingests either a Data class or a Pandas DataFrame. \n", + "\n", + "Currently, the Data class supports loading the following file formats:\n", + "\n", + "* Any delimited (CSV, TSV, etc.)\n", + "* JSON object\n", + "* Avro\n", + "* Parquet\n", + "* Text files\n", + "* Pandas Series/Dataframe\n", + "\n", + "Once the data is loaded, the Profiler can calculate statistics and predict the entities (via the Labeler) of every column (csv) or key-value (JSON) store as well as dataset wide information, such as the number of nulls, duplicates, etc.\n", + "\n", + "This example will look at specifically the unstructured data types for unstructured profiling. This means that only text files, lists of strings, single column pandas dataframes/series, or DataProfile Data objects in string format will work with the unstructured profiler. " + ] + }, + { + "cell_type": "markdown", + "id": "de58b9c4", + "metadata": {}, + "source": [ + "## Reporting" + ] + }, + { + "cell_type": "markdown", + "id": "8001185a", + "metadata": {}, + "source": [ + "One of the primary purposes of the Profiler are to quickly identify what is in the dataset. This can be useful for analyzing a dataset prior to use or determining which columns could be useful for a given purpose.\n", + "\n", + "In terms of reporting, there are multiple reporting options:\n", + "\n", + "* **Pretty**: Floats are rounded to four decimal places, and lists are shortened.\n", + "* **Compact**: Similar to pretty, but removes detailed statistics\n", + "* **Serializable**: Output is json serializable and not prettified\n", + "* **Flat**: Nested Output is returned as a flattened dictionary\n", + "\n", + "The **Pretty** and **Compact** reports are the two most commonly used reports and includes `global_stats` and `data_stats` for the given dataset. `global_stats` contains overall properties of the data such as samples used and file encoding. 
`data_stats` contains specific properties and statistics for each text sample.\n", + "\n", + "For unstructured profiles, the report looks like this:\n", + "\n", + "```\n", + "\"global_stats\": {\n", + " \"samples_used\": int,\n", + " \"empty_line_count\": int,\n", + " \"file_type\": string,\n", + " \"encoding\": string\n", + "},\n", + "\"data_stats\": {\n", + " \"data_label\": {\n", + " \"entity_counts\": {\n", + " \"word_level\": dict(int),\n", + " \"true_char_level\": dict(int),\n", + " \"postprocess_char_level\": dict(int)\n", + " },\n", + " \"times\": dict(float)\n", + " },\n", + " \"statistics\": {\n", + " \"vocab\": list(char),\n", + " \"words\": list(string),\n", + " \"word_count\": dict(int),\n", + " \"times\": dict(float)\n", + " }\n", + "}\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5fcb5447", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import json\n", + "sys.path.insert(0, '..')\n", + "import dataprofiler as dp\n", + "\n", + "data_path = \"../dataprofiler/tests/data\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a7fc2df6", + "metadata": {}, + "outputs": [], + "source": [ + "data = dp.Data(os.path.join(data_path, \"txt/discussion_reddit.txt\"))\n", + "profile = dp.Profiler(data)\n", + "\n", + "report = profile.report(report_options={\"output_format\": \"pretty\"})\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "4d183992", + "metadata": {}, + "source": [ + "## Profiler Type" + ] + }, + { + "cell_type": "markdown", + "id": "d7ec39d2", + "metadata": {}, + "source": [ + "It should be noted that, in addition to reading the input data from text files, the DataProfiler accepts the input data as a pandas dataframe, a pandas series, a list, or a Data object (when an unstructured format is selected), provided the Profiler is explicitly chosen as unstructured." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "29737f25", + "metadata": {}, + "outputs": [], + "source": [ + "# run data profiler and get the report\n", + "import pandas as pd\n", + "data = dp.Data(os.path.join(data_path, \"csv/SchoolDataSmall.csv\"), options={\"data_format\": \"records\"})\n", + "profile = dp.Profiler(data, profiler_type='unstructured')\n", + "\n", + "report = profile.report(report_options={\"output_format\":\"pretty\"})\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "fe02ad64", + "metadata": {}, + "source": [ + "## Profiler options" + ] + }, + { + "cell_type": "markdown", + "id": "40804cc9", + "metadata": {}, + "source": [ + "The DataProfiler has the ability to turn on and off components as needed. This is accomplished via the `ProfilerOptions` class.\n", + "\n", + "For example, if a user doesn't require vocab count information, they may want to turn off the vocab count functionality.\n", + "\n", + "Below, let's remove the vocab count and set the stop words. \n", + "\n", + "A full list of options is available in the Profiler section of the [DataProfiler documentation](https://capitalone.github.io/DataProfiler)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d25d899", + "metadata": {}, + "outputs": [], + "source": [ + "data = dp.Data(os.path.join(data_path, \"txt/discussion_reddit.txt\"))\n", + "\n", + "profile_options = dp.ProfilerOptions()\n", + "\n", + "# Setting multiple options via set\n", + "profile_options.set({ \"*.vocab.is_enabled\": False, \"*.is_case_sensitive\": True })\n", + "\n", + "# Set options via directly setting them\n", + "profile_options.unstructured_options.text.stop_words = [\"These\", \"are\", \"stop\", \"words\"]\n", + "\n", + "profile = dp.Profiler(data, options=profile_options)\n", + "report = profile.report(report_options={\"output_format\": \"compact\"})\n", + "\n", + "# Print the report\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "2052415a", + "metadata": {}, + "source": [ + "## Updating Profiles" + ] + }, + { + "cell_type": "markdown", + "id": "7e02f746", + "metadata": {}, + "source": [ + "Beyond just profiling, one of the unique aspects of the DataProfiler is the ability to update the profiles. For the update to work appropriately, the schema (columns / keys) must match." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ab8022f", + "metadata": {}, + "outputs": [], + "source": [ + "# Load and profile a text file\n", + "data = dp.Data(os.path.join(data_path, \"txt/sentence-3x.txt\"))\n", + "profile = dp.Profiler(data)\n", + "\n", + "# Update the profile with new data:\n", + "new_data = dp.Data(os.path.join(data_path, \"txt/sentence-3x.txt\"))\n", + "profile.update_profile(new_data)\n", + "\n", + "# Take a peek at the data\n", + "print(data.data)\n", + "print(new_data.data)\n", + "\n", + "# Report the compact version of the profile\n", + "report = profile.report(report_options={\"output_format\": \"compact\"})\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "66ec6dc5", + "metadata": {}, + "source": [ + "## Merging Profiles" + ] + }, + { + "cell_type": "markdown", + "id": "e2265fe9", + "metadata": {}, + "source": [ + "Merging profiles is an alternative method for updating profiles. In particular, multiple profiles can be generated separately, then added together with a simple `+` command: `profile3 = profile1 + profile2`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cc68ca07", + "metadata": {}, + "outputs": [], + "source": [ + "# Load a text file\n", + "data1 = dp.Data(os.path.join(data_path, \"txt/sentence-3x.txt\"))\n", + "profile1 = dp.Profiler(data1)\n", + "\n", + "# Load another text file with the same contents\n", + "data2 = dp.Data(os.path.join(data_path, \"txt/sentence-3x.txt\"))\n", + "profile2 = dp.Profiler(data2)\n", + "\n", + "# Merge the profiles\n", + "profile3 = profile1 + profile2\n", + "\n", + "# Report the compact version of the profile\n", + "report = profile3.report(report_options={\"output_format\":\"compact\"})\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "7ea07dc6", + "metadata": {}, + "source": [ + "As you can see, the `update_profile` function and the `+` operator function similarly. The reason the `+` operator is important is that it's possible to *save and load profiles*, which we cover next." 
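To make the equivalence between `update_profile` and the `+` operator concrete, here is a small illustrative sketch (not part of the original notebook) that builds the same combined profile both ways, reusing the sample path assumed in the cells above; apart from timing details, the two compact reports should describe the same data.

```python
import os
import json

import dataprofiler as dp

data_path = "../dataprofiler/tests/data"
sample = os.path.join(data_path, "txt/sentence-3x.txt")

# Route 1: profile once, then update the profile in place with more data.
updated = dp.Profiler(dp.Data(sample))
updated.update_profile(dp.Data(sample))

# Route 2: profile each batch separately, then merge with the `+` operator.
merged = dp.Profiler(dp.Data(sample)) + dp.Profiler(dp.Data(sample))

# Both compact reports should describe the same combined samples.
print(json.dumps(updated.report(report_options={"output_format": "compact"}), indent=4))
print(json.dumps(merged.report(report_options={"output_format": "compact"}), indent=4))
```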
+ ] + }, + { + "cell_type": "markdown", + "id": "30868000", + "metadata": {}, + "source": [ + "## Saving and Loading a Profile" + ] + }, + { + "cell_type": "markdown", + "id": "f2858072", + "metadata": {}, + "source": [ + "Not only can the Profiler create and update profiles, it's also possible to save, load, and then manipulate profiles." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2ad9ca57", + "metadata": {}, + "outputs": [], + "source": [ + "# Load data\n", + "data = dp.Data(os.path.join(data_path, \"txt/sentence-3x.txt\"))\n", + "\n", + "# Generate a profile\n", + "profile = dp.Profiler(data)\n", + "\n", + "# Save a profile to disk for later (saves as pickle file)\n", + "profile.save(filepath=\"my_profile.pkl\")\n", + "\n", + "# Load a profile from disk\n", + "loaded_profile = dp.Profiler.load(\"my_profile.pkl\")\n", + "\n", + "# Report the compact version of the profile\n", + "report = profile.report(report_options={\"output_format\":\"compact\"})\n", + "print(json.dumps(report, indent=4))" + ] + }, + { + "cell_type": "markdown", + "id": "8f9859c2", + "metadata": {}, + "source": [ + "With the ability to save and load profiles, profiles can be generated via multiple machines and then merged. Further, profiles can be stored and later used in applications such as change point detection, synthetic data generation, and more. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3571f2d0", + "metadata": {}, + "outputs": [], + "source": [ + "# Load multiple files via the Data class\n", + "filenames = [\"txt/sentence-3x.txt\",\n", + " \"txt/sentence.txt\"]\n", + "data_objects = []\n", + "for filename in filenames:\n", + " data_objects.append(dp.Data(os.path.join(data_path, filename)))\n", + "\n", + "print(data_objects)\n", + "# Generate and save profiles\n", + "for i in range(len(data_objects)):\n", + " profile = dp.Profiler(data_objects[i])\n", + " report = profile.report(report_options={\"output_format\":\"compact\"})\n", + " print(json.dumps(report, indent=4))\n", + " profile.save(filepath=\"data-\"+str(i)+\".pkl\")\n", + "\n", + "\n", + "# Load profiles and add them together\n", + "profile = None\n", + "for i in range(len(data_objects)):\n", + " if profile is None:\n", + " profile = dp.Profiler.load(\"data-\"+str(i)+\".pkl\")\n", + " else:\n", + " profile += dp.Profiler.load(\"data-\"+str(i)+\".pkl\")\n", + "\n", + "\n", + "# Report the compact version of the profile\n", + "report = profile.report(report_options={\"output_format\":\"compact\"})\n", + "print(json.dumps(report, indent=4))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/source/_static/images/histogram_example_0.png b/docs/source/_static/images/histogram_example_0.png new file mode 100644 index 000000000..9b8301363 Binary files /dev/null and b/docs/source/_static/images/histogram_example_0.png differ diff --git a/docs/source/_static/images/histogram_example_1.png b/docs/source/_static/images/histogram_example_1.png new file mode 100644 index 000000000..062dfdbb9 Binary files /dev/null and b/docs/source/_static/images/histogram_example_1.png differ diff --git 
a/docs/source/_static/images/histogram_example_2.png b/docs/source/_static/images/histogram_example_2.png new file mode 100644 index 000000000..1aedf7549 Binary files /dev/null and b/docs/source/_static/images/histogram_example_2.png differ diff --git a/docs/source/add_new_model_to_data_labeler.nblink b/docs/source/add_new_model_to_data_labeler.nblink index ab5111780..130e413fc 100644 --- a/docs/source/add_new_model_to_data_labeler.nblink +++ b/docs/source/add_new_model_to_data_labeler.nblink @@ -1,3 +1,3 @@ { - "path": "../../DataProfiler/examples/add_new_model_to_data_labeler.ipynb" + "path": "../../feature_branch/examples/add_new_model_to_data_labeler.ipynb" } \ No newline at end of file diff --git a/docs/source/conf.py b/docs/source/conf.py index 31ee8d4f7..8ab6a7959 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -13,8 +13,10 @@ import furo import os import sys -sys.path.insert(0, os.path.abspath('../../DataProfiler')) -sys.path.insert(0, os.path.abspath('../../DataProfiler/examples')) + +branch_name = "feature_branch" +sys.path.insert(0, os.path.abspath(f'../../{branch_name}')) +sys.path.insert(0, os.path.abspath(f'../../{branch_name}/examples')) # -- Project information ----------------------------------------------------- diff --git a/docs/source/data_reader.nblink b/docs/source/data_reader.nblink index 0cd127e4b..4722970da 100644 --- a/docs/source/data_reader.nblink +++ b/docs/source/data_reader.nblink @@ -1,3 +1,3 @@ { - "path": "../../DataProfiler/examples/data_readers.ipynb" + "path": "../../feature_branch/examples/data_readers.ipynb" } \ No newline at end of file diff --git a/docs/source/dataprofiler.reports.graphs.rst b/docs/source/dataprofiler.reports.graphs.rst index a1e839a20..3a7adf900 100644 --- a/docs/source/dataprofiler.reports.graphs.rst +++ b/docs/source/dataprofiler.reports.graphs.rst @@ -1,5 +1,5 @@ Graphs -======================================== +====== .. automodule:: dataprofiler.reports.graphs :members: diff --git a/docs/source/dataprofiler.reports.rst b/docs/source/dataprofiler.reports.rst index e12cd5f59..2b4c679c5 100644 --- a/docs/source/dataprofiler.reports.rst +++ b/docs/source/dataprofiler.reports.rst @@ -1,10 +1,14 @@ Reports -========= +======= Modules ------- +.. toctree:: + :maxdepth: 4 + + .. toctree:: :maxdepth: 4 diff --git a/docs/source/dataprofiler.rst b/docs/source/dataprofiler.rst index f2a881ef5..7a57e0427 100644 --- a/docs/source/dataprofiler.rst +++ b/docs/source/dataprofiler.rst @@ -11,6 +11,7 @@ Modules dataprofiler.data_readers dataprofiler.labelers dataprofiler.profilers + dataprofiler.reports dataprofiler.validators .. toctree:: diff --git a/docs/source/graphs.rst b/docs/source/graphs.rst index a12408c63..08d7a8063 100644 --- a/docs/source/graphs.rst +++ b/docs/source/graphs.rst @@ -1,10 +1,10 @@ .. _reports: Graphs -******** +****** Graph Your Data -================= +=============== We can plot some of our data as seaborn histogram plots. Below will demonstrate how to do so and provide examples. @@ -15,7 +15,7 @@ The following plots are currently available to work directly with your profilers Below shows how to do so with examples. What we need to import -~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~ .. code-block:: python from dataprofiler.reports import graphs @@ -27,7 +27,7 @@ The main functions that is used to plot histograms are in graphs. 
**You will als pip install 'dataprofiler[reports]' Plotting from a StructuredProfiler class -~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ With a StructuredProfiler class variable, we can specify what columns we want to be plotted, and plot them into histograms. @@ -41,7 +41,7 @@ These are what the variables mean: * **columns** - (Optional) The list of IntColumn or FloatColumn we want to specifically plot. Plotting an individual IntColumn or FloatColumn -~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Example uses a CSV file for example, but CSV, JSON, Avro or Parquet should also work. @@ -56,7 +56,7 @@ These are what the variables mean: * **title** - (Optional) The title of the plot we want to define. Examples -~~~~~~~~~~~~~~~~~ +~~~~~~~~ 1. This example demonstrates how we can take a StructuredProfiler class and plot histograms of the specified columns. @@ -64,6 +64,7 @@ Examples import dataprofiler as dp from dataprofiler.reports import graphs + data = [[1, 'a', 1.0], [2, 'b', 2.2], @@ -72,32 +73,40 @@ Examples profiler = dp.StructuredProfiler(data) # This will plot all IntColumn and FloatColumn as histograms (The first and last column). - graphs.plot_histograms(profiler) + fig = graphs.plot_histograms(profiler) + fig.show() # This will only plot the specified column, 0. columns = [0] - graphs.plot_histograms(profiler, columns) + fig = graphs.plot_histograms(profiler, columns) + fig.show() -.. image:: docs/source/_static/images/graph_0.svg - :alt: First Example Image +.. image:: _static/images/histogram_example_0.png + :alt: First Histogram Example Image -.. image:: docs/source/_static/images/graph_1.svg - :alt: Second Example Image +.. image:: _static/images/histogram_example_1.png + :alt: Second Histogram Example Image 2. This example demonstrates how we can plot a low level profiler. .. code-block:: python - import dataprofiler as dp + import pandas as pd + from dataprofiler.profilers import IntColumn from dataprofiler.reports import graphs + data = pd.Series([1, 2, 3], dtype=str) profiler = IntColumn('example') profiler.update(data) - # We will plot profiler - graphs.plot_col_histogram(profiler) + # Plot the axes + ax = graphs.plot_col_histogram(profiler) + + # get and show the figure of the plotted histogram + fig = ax.get_figure() + fig.show() -.. image:: docs/source/_static/images/graph_2.svg - :alt: Third Example Image \ No newline at end of file +.. image:: _static/images/histogram_example_2.png + :alt: Histogram Column Only Example Image diff --git a/docs/source/index.rst b/docs/source/index.rst index b9b3d0fb6..aaefa381b 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -395,6 +395,7 @@ In addition, it utilizes only the first 10,000 rows. data_readers.rst profiler.rst data_labeling.rst + graphs.rst .. toctree:: :maxdepth: 2 @@ -419,6 +420,7 @@ In addition, it utilizes only the first 10,000 rows. Versions ======== +* `0.7.1`_ * `0.7.0`_ * `0.6.0`_ * `0.5.3`_ @@ -454,3 +456,5 @@ Versions .. _0.7.0: ../../0.7.0/html/index.html +.. 
_0.7.1: ../../0.7.1/html/index.html + diff --git a/docs/source/labeler.nblink b/docs/source/labeler.nblink index 51f6bf16b..bed6517bf 100644 --- a/docs/source/labeler.nblink +++ b/docs/source/labeler.nblink @@ -1,6 +1,6 @@ { - "path": "../../DataProfiler/examples/labeler.ipynb", + "path": "../../feature_branch/examples/labeler.ipynb", "extra-media": [ - "../../DataProfiler/examples/DL-Flowchart.png" + "../../feature_branch/examples/DL-Flowchart.png" ] } \ No newline at end of file diff --git a/docs/source/overview.nblink b/docs/source/overview.nblink index 89109b4a5..3d9f89d3d 100644 --- a/docs/source/overview.nblink +++ b/docs/source/overview.nblink @@ -1,3 +1,3 @@ { - "path": "../../DataProfiler/examples/intro_data_profiler.ipynb" + "path": "../../feature_branch/examples/intro_data_profiler.ipynb" } \ No newline at end of file diff --git a/docs/source/profiler_example.nblink b/docs/source/profiler_example.nblink index 6f79f2b8c..8b1612784 100644 --- a/docs/source/profiler_example.nblink +++ b/docs/source/profiler_example.nblink @@ -1,3 +1,3 @@ { - "path": "../../DataProfiler/examples/structured_profilers.ipynb" + "path": "../../feature_branch/examples/structured_profilers.ipynb" } \ No newline at end of file diff --git a/docs/source/unstructured_profiler_example.nblink b/docs/source/unstructured_profiler_example.nblink index f1df89fc0..1589c41d4 100644 --- a/docs/source/unstructured_profiler_example.nblink +++ b/docs/source/unstructured_profiler_example.nblink @@ -1,3 +1,3 @@ { - "path": "../../DataProfiler/examples/unstructured_profilers.ipynb" + "path": "../../feature_branch/examples/unstructured_profilers.ipynb" } \ No newline at end of file diff --git a/docs/update_documentation.py b/docs/update_documentation.py index 03ffa4e9a..18db3eb04 100644 --- a/docs/update_documentation.py +++ b/docs/update_documentation.py @@ -3,7 +3,7 @@ import subprocess import os -branch_folder = "DataProfiler" +branch_folder = "feature_branch" sys.path.insert(0, os.path.abspath(f'../{branch_folder}')) from dataprofiler import __version__ as version # noqa F401 diff --git a/index.html b/index.html index 439fceb41..cf0e00a93 100644 --- a/index.html +++ b/index.html @@ -1 +1 @@ - \ No newline at end of file + \ No newline at end of file