Commit
Kraus Julia 8SP3 committed Feb 26, 2018
1 parent 4f008f7 · commit 384d226
Showing 14 changed files with 941 additions and 0 deletions.
@@ -0,0 +1,384 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"C:\\Program Files (x86)\\Microsoft Visual Studio\\Shared\\Anaconda3_64\\lib\\site-packages\\h5py\\__init__.py:34: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n", | ||
" from ._conv import register_converters as _register_converters\n" | ||
] | ||
}, | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"curses is not supported on this machine (please install/reinstall curses for an optimal experience)\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# things we need for NLP\n", | ||
"import nltk\n", | ||
"from nltk.stem.lancaster import LancasterStemmer\n", | ||
"stemmer = LancasterStemmer()\n", | ||
"\n", | ||
"# things we need for TensorFlow\n", | ||
"import numpy as np\n", | ||
"import tflearn\n", | ||
"import tensorflow as tf\n", | ||
"import random" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"{'intents': [{'tag': 'greeting', 'patterns': ['Hi', 'How are you', 'Is anyone there?', 'Hello', 'Good day'], 'responses': ['Hello, thanks for visiting', 'Good to see you again', 'Hi there']}, {'tag': 'goodbye', 'patterns': ['Bye', 'See you later', 'Goodbye'], 'responses': ['See you later', 'Have a nice day', 'Bye! Come back again soon.']}, {'tag': 'thanks', 'patterns': ['Thanks', 'Thank you', \"That's helpful\"], 'responses': ['Happy to help!', 'Any time!', 'My pleasure']}, {'tag': 'med', 'patterns': [\"I'm looking for cheap meds\", 'want to find a deal', 'where are the cheapest meds', 'where can I buy meds for less money'], 'responses': [\"What's the name of the medication?\"], 'context': 'getRx'}, {'tag': 'handleRx', 'patterns': [], 'responses': [], 'context': 'handleRx'}, {'tag': 'coupon', 'patterns': ['what is the coupon', 'send me the coupon'], 'responses': ['Sorry, no coupon', 'No coupon available'], 'context': 'coupon'}]}\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# import our chat-bot intents file\n", | ||
"import json\n", | ||
"with open('intents1.json') as json_data:\n", | ||
" intents = json.load(json_data)\n", | ||
" \n", | ||
"print(intents)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"With our intents JSON file loaded, we can now begin to organize our documents, words and classification classes." | ||
] | ||
}, | ||
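{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"As a quick orientation (an illustrative check that only uses the intents dict loaded above), the next cell lists the tags that will become our classes:" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# each intent's tag becomes one classification class\n", | ||
"[intent['tag'] for intent in intents['intents']]" | ||
] | ||
}, | ||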
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"words = []\n", | ||
"classes = []\n", | ||
"documents = []\n", | ||
"ignore_words = ['?']\n", | ||
"\n", | ||
"# loop through each sentence in our intents patterns\n", | ||
"for intent in intents['intents']:\n", | ||
" for pattern in intent['patterns']:\n", | ||
" # tokenize each word in the sentence\n", | ||
" w = nltk.word_tokenize(pattern)\n", | ||
" # add to our word list\n", | ||
" words.extend(w)\n", | ||
" # add to documents in our corpus\n", | ||
" documents.append((w, intent['tag']))\n", | ||
" # add to our classes list\n", | ||
" if intent['tag'] not in classes:\n", | ||
" classes.append(intent['tag'])\n", | ||
" " | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"17 documents\n", | ||
"5 classes ['coupon', 'goodbye', 'greeting', 'med', 'thanks']\n", | ||
"40 unique stemmed words [\"'m\", \"'s\", 'a', 'anyon', 'ar', 'buy', 'bye', 'can', 'cheap', 'cheapest', 'coupon', 'day', 'deal', 'find', 'for', 'good', 'goodby', 'hello', 'help', 'hi', 'how', 'i', 'is', 'lat', 'less', 'look', 'me', 'med', 'money', 'see', 'send', 'thank', 'that', 'the', 'ther', 'to', 'want', 'what', 'wher', 'you']\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# stem and lower each word and remove duplicates. set removes duplicates, then turn back to list\n", | ||
"words = [stemmer.stem(w.lower()) for w in words if w not in ignore_words]\n", | ||
"words = sorted(list(set(words)))\n", | ||
"\n", | ||
"# remove duplicates\n", | ||
"classes = sorted(list(set(classes)))\n", | ||
"\n", | ||
"print(len(documents), \"documents\")\n", | ||
"print(len(classes), \"classes\", classes)\n", | ||
"print(len(words), \"unique stemmed words\", words)\n", | ||
"# words is our word lexicon" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"We create a list of documents (sentences), each sentence is a list of stemmed words and each document is associated with an intent (a class). The stem 'tak' will match 'take', 'taking', 'takers', etc. We could clearn the words list and remove useless entries but this will suffice for now. \n", | ||
"\n", | ||
"Unfortunately, this data structure won't work with TensorFlow, we need to transform it further: *from documents of words* into *tensors of numbers*." | ||
] | ||
}, | ||
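{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"To see what the stemmer actually does, here is a small illustrative check (the word forms are arbitrary examples) of how related forms collapse to one stem:" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# the Lancaster stemmer maps related word forms to a common stem\n", | ||
"[stemmer.stem(w.lower()) for w in ['take', 'taking', 'takers']]" | ||
] | ||
}, | ||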
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"2" | ||
] | ||
}, | ||
"execution_count": 5, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"family = [0] * 3\n", | ||
"names = ['Katharina', 'Julia', 'Christiane']\n", | ||
"names.index('Christiane')" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"Python List index() method: The method __index()__ returns the lowest inde in list that obj appears." | ||
] | ||
}, | ||
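{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"A small made-up example of the 'lowest index' behaviour: the element appears twice, and index() reports only the first position." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# 'Julia' appears at positions 0 and 2; index() returns 0\n", | ||
"['Julia', 'Katharina', 'Julia'].index('Julia')" | ||
] | ||
}, | ||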
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# create our training data\n", | ||
"training = []\n", | ||
"output = []\n", | ||
"# create an empty array for our output\n", | ||
"output_empty = [0] * len(classes)\n", | ||
"\n", | ||
"# training set, bag of words for each sentence\n", | ||
"for doc in documents:\n", | ||
" # initialize our bag of words\n", | ||
" bag = []\n", | ||
" # list of tokenized words for the pattern\n", | ||
" pattern_words = doc[0]\n", | ||
" # stem each word\n", | ||
" pattern_words = [stemmer.stem(word.lower()) for word in pattern_words]\n", | ||
" # create our bag of words array\n", | ||
" for w in words:\n", | ||
" bag.append(1) if w in pattern_words else bag.append(0)\n", | ||
" \n", | ||
" # output is a '0' for each tag and '1' for current tag\n", | ||
" # list is not necessary because output_empty is already a list\n", | ||
" # index function: \n", | ||
" output_row = list(output_empty)\n", | ||
" output_row[classes.index(doc[1])] = 1\n", | ||
" training.append([bag, output_row])\n", | ||
" \n", | ||
"# shuffle our features and turn into np.array\n", | ||
"random.shuffle(training)\n", | ||
"training = np.array(training)\n", | ||
"\n", | ||
"# create train and test lists\n", | ||
"train_x = list(training[:, 0])\n", | ||
"train_y = list(training[:, 1])" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"Notice that our data is shuffled. TensorFlow will take some of this and use it as test data to *gauge accuracy for a newly fitted model*.\n", | ||
"\n", | ||
"If we look at a single x and y list element, we see 'bag of words' arrays, one for the intent pattern, the other for the intent class. We're ready to build our model." | ||
] | ||
}, | ||
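{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"For example, the first training example (which one that is depends on the shuffle) is a bag-of-words vector over our lexicon paired with a one-hot vector over our classes:" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# one bag-of-words input and its one-hot class label\n", | ||
"print(train_x[0])\n", | ||
"print(train_y[0])" | ||
] | ||
}, | ||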
{ | ||
"cell_type": "code", | ||
"execution_count": 7, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Training Step: 2999 | total loss: \u001b[1m\u001b[32m0.01249\u001b[0m\u001b[0m | time: 0.014s\n", | ||
"| Adam | epoch: 1000 | loss: 0.01249 - acc: 0.9999 -- iter: 16/17\n", | ||
"Training Step: 3000 | total loss: \u001b[1m\u001b[32m0.01202\u001b[0m\u001b[0m | time: 0.018s\n", | ||
"| Adam | epoch: 1000 | loss: 0.01202 - acc: 0.9999 -- iter: 17/17\n", | ||
"--\n", | ||
"INFO:tensorflow:C:\\Users\\kraus_ju\\Documents\\chatbot\\model.tflearn is not in all_model_checkpoint_paths. Manually adding it.\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# reset underlying graph data\n", | ||
"tf.reset_default_graph()\n", | ||
"# Build neural network\n", | ||
"net = tflearn.input_data(shape=[None, len(train_x[0])])\n", | ||
"net = tflearn.fully_connected(net, 8)\n", | ||
"net = tflearn.fully_connected(net, 8)\n", | ||
"net = tflearn.fully_connected(net, len(train_y[0]), activation='softmax')\n", | ||
"net = tflearn.regression(net)\n", | ||
"\n", | ||
"# Define model and setup tensorboard\n", | ||
"model = tflearn.DNN(net, tensorboard_dir='tflearn_logs')\n", | ||
"# Start training (apply gradient descent algorithm)\n", | ||
"model.fit(train_x, train_y, n_epoch=1000, batch_size=8, show_metric=True)\n", | ||
"model.save('model.tflearn')\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"To complete this section of work, we'll save ('pickle') our model and documents so the next notebook can use them." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 8, | ||
"metadata": { | ||
"collapsed": true | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"# save all of our data structures\n", | ||
"import pickle\n", | ||
"pickle.dump({'words': words, 'classes': classes, 'train_x': train_x, 'train_y': train_y},\n", | ||
" open(\"training_data\", \"wb\"))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## Building Our Chatbot Framework\n", | ||
"We'll build a simple state-machine to handle responses, using our intents model (from the previous step) as our classifier. That's how chatbots work. After loading the same imports, we'll *un-pickle* our model and documents as well as reload our intents file. Remember our chatbot framework is separate from our model build - you don't need to rebuild your model unless the intent pattern changes. With several hundred intents and thousands of patterns the model could take several minutes to build." | ||
] | ||
}, | ||
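{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"The sketch below shows one way the classify-and-respond loop described above could look. classify_sketch, respond_sketch and the 0.25 confidence threshold are illustrative choices, not cells from this notebook; the sketch assumes the words, classes, model and intents reloaded below and the bow() helper defined at the end of the notebook." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# illustrative sketch of the response state machine, not the notebook's own implementation\n", | ||
"ERROR_THRESHOLD = 0.25  # assumed cut-off for low-confidence predictions\n", | ||
"\n", | ||
"def classify_sketch(sentence):\n", | ||
"    # run the bag-of-words vector through the model and keep confident classes\n", | ||
"    results = model.predict([bow(sentence, words)])[0]\n", | ||
"    ranked = [(classes[i], p) for i, p in enumerate(results) if p > ERROR_THRESHOLD]\n", | ||
"    return sorted(ranked, key=lambda x: x[1], reverse=True)\n", | ||
"\n", | ||
"def respond_sketch(sentence):\n", | ||
"    # answer with a random response from the top-ranked intent, if any\n", | ||
"    ranked = classify_sketch(sentence)\n", | ||
"    if ranked:\n", | ||
"        for intent in intents['intents']:\n", | ||
"            if intent['tag'] == ranked[0][0] and intent['responses']:\n", | ||
"                return random.choice(intent['responses'])\n", | ||
"    return 'Sorry, I did not understand that.'" | ||
] | ||
}, | ||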
{ | ||
"cell_type": "code", | ||
"execution_count": 9, | ||
"metadata": { | ||
"collapsed": true | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"# things we need for NLP\n", | ||
"import nltk\n", | ||
"from nltk.stem.lancaster import LancasterStemmer\n", | ||
"stemmer = LancasterStemmer()\n", | ||
"\n", | ||
"# things we need for Tensorflow\n", | ||
"import numpy as np\n", | ||
"import tflearn\n", | ||
"import tensorflow as tf\n", | ||
"import random" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 11, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# restore all of our data structures\n", | ||
"import pickle\n", | ||
"data = pickle.load( open( \"training_data\", \"rb\" ) )\n", | ||
"words = data['words']\n", | ||
"classes = data['classes']\n", | ||
"train_x = data['train_x']\n", | ||
"train_y = data['train_y']\n", | ||
"\n", | ||
"# import our chat-bot intents file\n", | ||
"import json\n", | ||
"with open('intents1.json') as json_data:\n", | ||
" intents = json.load(json_data)\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 12, | ||
"metadata": { | ||
"collapsed": true | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"# Build neural network\n", | ||
"net = tflearn.input_data(shape=[None, len(train_x[0])])\n", | ||
"net = tflearn.fully_connected(net, 8)\n", | ||
"net = tflearn.fully_connected(net, 8)\n", | ||
"net = tflearn.fully_connected(net, len(train_y[0]), activation='softmax')\n", | ||
"net = tflearn.regression(net)\n", | ||
"\n", | ||
"# Define model and setup tensorboard\n", | ||
"model = tflearn.DNN(net, tensorboard_dir='tflearn_logs')" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"Before we can begin processing intents, we need a way to produce a bag-of-words from *user input*. This is the same technique as we used earlier to create our training documents." | ||
] | ||
}, | ||
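{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"As a quick check of the first step (the sentence below is just an example), NLTK's tokenizer splits raw user input into the word tokens we will then stem and match against our lexicon:" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# tokenize a sample user sentence before stemming and bag-of-words matching\n", | ||
"nltk.word_tokenize('Where can I buy meds for less money?')" | ||
] | ||
}, | ||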
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"collapsed": true | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"# this is now for new data without labels! Bad: duplicate code --> refactor\n", | ||
"\n", | ||
"def clean_up_sentence(sentence):\n", | ||
" # tokenize the pattern\n", | ||
" sentence_words = nltk.word_tokenize(sentence)\n", | ||
" # stem each word\n", | ||
" sentence_words = [stemmer.stem(word.lower()) for word in sentence]\n", | ||
" return sentence_words\n", | ||
"\n", | ||
"# return bag of words array: 0 or 1 for each word i the bag that exists in the sentence\n", | ||
"def bow(sentence, words, show_details=False):\n", | ||
" # tokenize the pattern\n", | ||
" sentence_words = clean_up_sentence(sentence)\n", | ||
" # bag of words" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.6.2" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |