# syntax=docker/dockerfile:1
FROM python:3.8-slim-buster

# OS packages, installed in ONE layer:
#   - "apt-get update" and "install" must share a layer, otherwise a cached
#     (stale) package index can be reused with a changed install line;
#   - the clean-up only shrinks the image when it runs in the same layer that
#     created the files.
# "upgrade" pulls the pending Debian security updates; gcc is required to
# build some of the Python requirements from source.
RUN apt-get update \
    && apt-get -y upgrade \
    && apt-get -y install --no-install-recommends gcc \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Unprivileged runtime user. A home directory is created because libraries
# used by the app keep a per-user download cache.
RUN groupadd --system --gid 10001 app \
    && useradd --system --uid 10001 --gid app --create-home app

WORKDIR /opt/app

# Install the app libraries BEFORE copying the sources, so this (slow) layer
# is rebuilt only when requirements.txt changes.
COPY requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt

# Install SpaCy small model.
RUN python -m spacy download en_core_web_sm

# Copy the application. Ownership is given to the runtime user because the
# app writes downloaded model files under ./assets/model at start-up.
COPY --chown=app:app . /opt/app

USER app

# Default port of the bot (see PORT in config.py). Documentation only.
EXPOSE 3978

# Start the app.
ENTRYPOINT ["python"]
CMD ["main.py"]
\ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..63a9fee --- /dev/null +++ b/README.md @@ -0,0 +1,80 @@ +# chatbot + + + + + +## Cross-plateforme implementation + + +## fonctionnalités du Bot : + +### Traitement des inputs "utilisateurs" + +- Bert ? + +### Possible réponses + +- Accueillire + - Décrire les fonctionnalités du bot + - Répondre +- Donner les heures d'ouvertures +- Afficher une liste d'objet, de produit + +### Nice to have + +- Réserver un service / un produit + - Gérer un agenda + - vérifier la disponibilité + - +- + + "**In English**, the bot should be able to :\n", + "\n", + "- Understand phrases related to a room reservation.\n", + "Example that the bot will have to understand: \n", + "\n", + "\t\t> I wish to reserve a room for 2 people.\n", + "\t\t> I wish to reserve a room for 4 days\n", + "\t\t> Do you have rooms available from July 23rd?\n", + "\t\t> I would like to reserve a room for two days and for two people\n", + "\n", + "- Understand phrases related to a table reservation for the restaurant. \n", + "\n", + "\t\t> I would like to make a reservation for tonight.\n", + "\t\t> I'd like to reserve a table for four people.\n", + "\n", + "- Must ensure a continuous and ongoing conversation. Example of a complete conversation : \n", + "\n", + "\t\t> Customer : Hello !\n", + "\t\t> Bot : Hello, how can I help you? \n", + "\t\t> Customer: I would like to reserve a table for 4 people ? \n", + "\t\t> Bot : For which date would you like to reserve your table?\n", + "\t\t> Customer : Today at 7:00 pm\n", + "\t\t> Bot : What name should I make the reservation under?\n", + "\t\t> Customer : My name is Mr. Dupont! \n", + "\t\t> Bot : Very well Mr Dupont, I confirm you the reservation of a table for 4 people tonight at 7:00 pm. \n", + "\t\t> Bot : Can I help you with something else?\n", + "\t\t> Customer : No thanks\n", + "\t\t> Bot: Have a nice day. \n", + "\n", + "- Understand when the client is angry. 
class Config:
    """
    Bot configuration class.

    All values are class attributes, evaluated once when the module is
    imported. Values read from the environment fall back to the defaults
    shown below when the variable is not set.
    """

    # Deployment: TCP port the web server listens on
    PORT = int(environ.get("PORT", 3978))

    # Azure deployment: credentials handed to the Bot Framework adapter
    APP_ID = environ.get("MS_APP_ID", "")
    APP_PASSWORD = environ.get("MS_APP_PASSWORD", "")

    # Models
    MODEL_PREPROCESS = "en_core_web_sm"  # SpaCy smallest model - For preprocess
    MODEL_MATCHING = "TF-IDF"  # PolyFuzz lightest model - Optimized for matching
    MODEL_CLASSIFIER = "bert-base-uncased"  # HuggingFace smallest BERT model - For tokenization and classifying

    # Remote files.
    # A trailing slash is stripped so that "https://host/bucket/" and
    # "https://host/bucket" both produce "https://host/bucket/<file>" instead
    # of a URL containing "//<file>".
    s3_base_url = environ.get("S3_BASE_URL", "").rstrip("/")

    weight_file = "resa_BERT_model.pt"
    MODEL_WEIGHT_URL = f"{s3_base_url}/{weight_file}"  # Fine-tuned weights for BERT model
    MODEL_WEIGHT_LOCAL_COPY = f"./assets/model/{weight_file}"

    classes_file = "labels.pickle"
    MODEL_CLASSES_URL = f"{s3_base_url}/{classes_file}"  # Pickled labels of the classifier
    MODEL_CLASSES_LOCAL_COPY = f"./assets/model/{classes_file}"

    # Filters: path of the TOML document describing the keyword filters
    FILTERS_TOML = "./filters.toml"
+ [longtalk_make_reservation.duration] + words = ["day", "night"] + regex = '''(?P\d)\W%s''' + threshold = 0.85 diff --git a/main.py b/main.py new file mode 100644 index 0000000..6a24ddc --- /dev/null +++ b/main.py @@ -0,0 +1,114 @@ + +import sys +import traceback +from datetime import datetime +from http import HTTPStatus + +from aiohttp import web +from aiohttp.web import Request, Response, json_response +from botbuilder.core import BotFrameworkAdapterSettings, TurnContext, BotFrameworkAdapter, ConversationState, MemoryStorage, UserState +from botbuilder.core.integration import aiohttp_error_middleware +from botbuilder.schema import Activity, ActivityTypes + +from src.dialogs import MainDialog, BookingRoomDialog +from src.nlu import NLU +from src import Bot +from config import Config + +# Load the config and create the bot +config = Config() + +# Init a Bot adapter https://aka.ms/about-bot-adapter +settings = BotFrameworkAdapterSettings(config.APP_ID, config.APP_PASSWORD) +ADAPTER = BotFrameworkAdapter(settings) + + +# Catch-all for errors +async def on_error(context: TurnContext, error_: Exception): + """ + Catch-all functions to write out errors on console log. + NOTE: In production environment, logging should be done + to Azure application insights. + """ + + # Print the error into the logs + print(f"\n [on_turn_error] unhandled error: {error_}", file=sys.stderr) + traceback.print_exc() + + # Send a message to the user + await context.send_activity("The bot encountered an error or bug.") + + # If the bot is run from the Bot Framework Emulator (dev environment), + # print a more complete error log. 
async def messages(req: Request) -> Response:
    """
    Main bot function: Listen for incoming API request.
    Route: '/api/messages'.

    :param req: Incoming HTTP request; its JSON body is deserialized into a
        Bot Framework Activity.
    :return: HTTP-415 when the request is not declared as JSON, the adapter's
        own response when it produces one, HTTP-200 otherwise.
    """

    # Filter only JSON requests. The header is read with .get() so that a
    # request sent WITHOUT any Content-Type is answered with 415 as well,
    # instead of failing on a missing key.
    content_type = req.headers.get("Content-Type", "")
    if "application/json" not in content_type:
        return Response(status=HTTPStatus.UNSUPPORTED_MEDIA_TYPE)

    body = await req.json()

    # Deserialize the JSON
    activity = Activity().deserialize(body)

    # Retrieve the authorization code if sent (empty string otherwise)
    auth_header = req.headers.get("Authorization", "")

    # Call the bot and send back its response
    response = await ADAPTER.process_activity(activity, auth_header, bot.on_turn)
    if response:
        return json_response(data=response.body, status=response.status)

    # Return HTTP-200 if no response is sent back
    return Response(status=HTTPStatus.OK)
class DialogBot(ActivityHandler):
    """
    Bot implementation that can run any type of Dialog.

    On every message activity the given dialog is run through
    DialogHelper.run_dialog, using a "DialogState" property created on the
    ConversationState. At the end of every turn both the ConversationState
    and the UserState are saved, whether or not the dialog touched them.
    """

    def __init__(
        self,
        conversation_state: ConversationState,
        user_state: UserState,
        dialog: Dialog,
    ):
        """
        :param conversation_state: State holding the dialog stack.
        :param user_state: State holding per-user data.
        :param dialog: Dialog started/continued on every message.
        :raises TypeError: If any of the three parameters is None.
        """
        if conversation_state is None:
            raise TypeError(
                "[DialogBot]: Missing parameter. conversation_state is required but None was given"
            )
        if user_state is None:
            raise TypeError(
                "[DialogBot]: Missing parameter. user_state is required but None was given"
            )
        if dialog is None:
            # TypeError (a subclass of Exception) for consistency with the
            # two checks above; callers catching Exception are unaffected.
            raise TypeError("[DialogBot]: Missing parameter. dialog is required")

        self.conversation_state = conversation_state
        self.user_state = user_state
        self.dialog = dialog

    async def on_turn(self, turn_context: TurnContext):
        """Dispatch the activity, then persist both states."""
        await super().on_turn(turn_context)

        # Save any state changes that might have occurred during the turn.
        await self.conversation_state.save_changes(turn_context)
        await self.user_state.save_changes(turn_context)

    async def on_message_activity(self, turn_context: TurnContext):
        """Start or continue the dialog with the incoming message."""
        await DialogHelper.run_dialog(
            self.dialog,
            turn_context,
            self.conversation_state.create_property("DialogState"),
        )
class BookingRoomDialog(ComponentDialog):
    """
    Waterfall dialog collecting a hotel room reservation.

    Steps: number of people -> number of nights -> breakfast -> summary.
    The dialog is started with a dictionary of keywords (its "options");
    when that dictionary already holds a valid 'people' or 'duration' value,
    the matching question is skipped.
    """

    # Inclusive bounds, shared by the prompt validators and by the check of
    # the pre-filled keywords.
    _PEOPLE_RANGE = (1, 4)
    _DURATION_RANGE = (1, 7)

    def __init__(self, nlu_recognizer: NLU, user_state: UserState):
        super(BookingRoomDialog, self).__init__(BookingRoomDialog.__name__)

        # Load the NLU module
        self._nlu_recognizer = nlu_recognizer

        # Accessor used to store the resulting RoomReservation in the user state
        self.room_reservation_accessor = user_state.create_property("RoomReservation")

        # Setup the waterfall dialog
        self.add_dialog(WaterfallDialog("WFBookingDialog", [
            self.people_step,
            self.duration_step,
            self.breakfast_step,
            self.summary_step,
        ]))

        # Append the prompts and custom prompts
        self.add_dialog(NumberPrompt("PeoplePrompt", BookingRoomDialog.people_prompt_validator))
        self.add_dialog(NumberPrompt("DurationPrompt", BookingRoomDialog.duration_prompt_validator))
        self.add_dialog(ConfirmPrompt("IsTakingBreakfastPrompt"))

        self.initial_dialog_id = "WFBookingDialog"

    @staticmethod
    def _prefilled_number(keywords, name: str, bounds: tuple):
        """
        Return keywords[name] as an int when it lies within the inclusive
        bounds, otherwise None (missing, not a number, or out of range).

        Pre-filled keywords do not go through the prompt validators, so they
        are checked here against the same limits.
        """
        if not isinstance(keywords, dict):
            return None

        try:
            value = int(keywords.get(name))
        except (TypeError, ValueError):
            return None

        low, high = bounds
        return value if low <= value <= high else None

    @staticmethod
    async def people_step(step_context: WaterfallStepContext) -> DialogTurnResult:
        """Ask the user: how many people to make the reservation?"""

        # Retrieve the booking keywords (an empty dict when none were given)
        booking_keywords: dict = step_context.options or {}
        step_context.values['booking_keywords'] = booking_keywords

        # If the keyword 'people' exists and is valid, pass the question
        people = BookingRoomDialog._prefilled_number(
            booking_keywords, 'people', BookingRoomDialog._PEOPLE_RANGE
        )
        if people is not None:
            return await step_context.next(people)

        # Give user suggestions (1 or 2 people).
        # The user can still write a custom number of people [1, 4].
        options = PromptOptions(
            prompt=Activity(

                type=ActivityTypes.message,
                text="Would you like a single or a double room?",

                suggested_actions=SuggestedActions(
                    actions=[
                        CardAction(
                            title="Single",
                            type=ActionTypes.im_back,
                            value="Single room (1 people)"
                        ),
                        CardAction(
                            title="Double",
                            type=ActionTypes.im_back,
                            value="Double room (2 peoples)"
                        )
                    ]
                )
            ),
            retry_prompt=MessageFactory.text(
                "Reservations can be made for one to four people only."
            )
        )

        # NumberPrompt - How many people ?
        return await step_context.prompt(
            "PeoplePrompt",
            options
        )

    @staticmethod
    async def duration_step(step_context: WaterfallStepContext) -> DialogTurnResult:
        """Ask the user: how many night to reserve?"""

        # Save the number of people
        step_context.values["people"] = step_context.result

        # Retrieve the keywords
        booking_keywords: dict = step_context.values["booking_keywords"]

        # If the keyword 'duration' exists and is valid, pass the question
        duration = BookingRoomDialog._prefilled_number(
            booking_keywords, 'duration', BookingRoomDialog._DURATION_RANGE
        )
        if duration is not None:
            return await step_context.next(duration)

        # NumberPrompt - How many nights ? (duration)
        return await step_context.prompt(
            "DurationPrompt",
            PromptOptions(
                prompt=MessageFactory.text("How long do you want to stay?"),
                retry_prompt=MessageFactory.text(
                    "It is only possible to book from 1 to 7 nights"
                ),
            ),
        )

    @staticmethod
    async def breakfast_step(step_context: WaterfallStepContext) -> DialogTurnResult:
        """Recap people and nights, then ask whether breakfast is wanted."""

        # Save the number of nights
        step_context.values["duration"] = step_context.result

        # Confirm people and duration
        await step_context.context.send_activity(
            MessageFactory.text(
                f"Okay, so {step_context.values['people']} people for {step_context.values['duration']} nights"
            )
        )

        # ConfirmPrompt - Is taking breakfast ?
        return await step_context.prompt(
            "IsTakingBreakfastPrompt",
            PromptOptions(
                prompt=MessageFactory.text("Will you be having breakfast?")
            ),
        )

    async def summary_step(self, step_context: WaterfallStepContext) -> DialogTurnResult:
        """Store the reservation in the user state and end the dialog."""

        # Save if the user take the breakfast (bool)
        step_context.values["breakfast"] = step_context.result

        # If the user said "Yes":
        if step_context.result:

            # Confirm breakfast hour
            await step_context.context.send_activity(
                MessageFactory.text("Perfect, breakfast is from 6am to 10am")
            )

        # Save information to Reservation object (created on first access)
        room_reservation = await self.room_reservation_accessor.get(
            step_context.context, RoomReservation
        )

        room_reservation.people = step_context.values["people"]
        room_reservation.duration = step_context.values["duration"]
        room_reservation.breakfast = step_context.values["breakfast"]

        # End the dialog
        await step_context.context.send_activity(
            MessageFactory.text("Your booking has been made !")
        )

        return await step_context.end_dialog()

    @staticmethod
    async def people_prompt_validator(prompt_context: PromptValidatorContext) -> bool:
        """Validate the number of people entered by the user."""

        # Restrict people between [1 and 4].
        low, high = BookingRoomDialog._PEOPLE_RANGE
        return (
            prompt_context.recognized.succeeded
            and low <= prompt_context.recognized.value <= high
        )

    @staticmethod
    async def duration_prompt_validator(prompt_context: PromptValidatorContext) -> bool:
        """Validate the number of nights entered by the user."""

        # Restrict nights between [1 and 7].
        low, high = BookingRoomDialog._DURATION_RANGE
        return (
            prompt_context.recognized.succeeded
            and low <= prompt_context.recognized.value <= high
        )
class NLUHelper:
    """Awaitable entry point used by the dialogs to query the NLU module."""

    @staticmethod
    async def execute_nlu_query(nlu_recognizer: NLU, message: str) -> (Intent, dict):
        """
        Run the recognizer on a message.

        :param nlu_recognizer: NLU instance to query.
        :param message: Raw text written by the user.
        :return: Whatever nlu_recognizer.get_intent returns for the message
            (annotated as an intent and a dictionary of keywords).
        """

        recognized = nlu_recognizer.get_intent(message)
        return recognized
async def act_step(self, step_context: WaterfallStepContext) -> DialogTurnResult:
    """
    Act step. Infer the intention behind the user's answer and hand over
    to the matching sub-dialog; apologize and move on to the next step
    when the intention is not one this dialog can dispatch.
    """

    user_message = step_context.result
    intent, keywords = await NLUHelper.execute_nlu_query(
        self._nlu_recognizer, user_message
    )

    # Anything but a room booking: tell the user and go to the next step
    if intent != Intent.BOOK_ROOM:
        apology = "Sorry, I didn't get that. Please try asking in a different way"
        reply = MessageFactory.text(apology, apology, InputHints.ignoring_input)

        await step_context.context.send_activity(reply)
        return await step_context.next(None)

    # Room booking: run the BookingRoomDialog, passing it keywords from nlu
    return await step_context.begin_dialog(self._booking_dialog_id, keywords)
class Classifier:
    """
    Intent classifier: a BERT sequence-classification model whose labels and
    fine-tuned weights are downloaded when the object is created.
    """

    def __init__(self):

        # Load the classes first: the model needs their count
        self.labels = self._load_labels()
        self.model = self._load_model()

    @staticmethod
    def __load_remote_file(url: str, local: str, timeout: float = 30):
        """
        Stream a remote file to a local path.

        :param url: Address of the file to download.
        :param local: Path of the local copy (overwritten).
        :param timeout: Seconds to wait for the server before giving up.
        :raises requests.HTTPError: If the server answers with an error status.
        """

        with requests.get(url, stream=True, timeout=timeout) as response:

            # Fail here on 4xx/5xx: otherwise the error page would be saved
            # as the model/labels file and break later while being parsed.
            response.raise_for_status()

            with open(local, 'wb') as handle:

                # Stream the file to the local copy
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        handle.write(chunk)

    def _load_labels(self) -> dict:
        """
        Load the dictionary labels from a remote pickle file and return it.
        """

        # Download and save the pickle locally
        self.__load_remote_file(config.MODEL_CLASSES_URL, config.MODEL_CLASSES_LOCAL_COPY)

        # SECURITY NOTE(review): unpickling executes arbitrary code, so
        # whoever controls the remote file controls this process. Only point
        # S3_BASE_URL at a trusted location; a data-only format (e.g. JSON)
        # would be safer - confirm with the producer of labels.pickle.
        with open(config.MODEL_CLASSES_LOCAL_COPY, 'rb') as handle:
            return pickle.load(handle)

    def _load_model(self) -> BertForSequenceClassification:
        """
        Load the weight of the model from a remote file (around 500 Mo),
        instantiate and return the model.
        """

        # Download and save the weights locally
        self.__load_remote_file(config.MODEL_WEIGHT_URL, config.MODEL_WEIGHT_LOCAL_COPY)

        # Instantiate the model with one output per label
        model = BertForSequenceClassification.from_pretrained(
            config.MODEL_CLASSIFIER,
            num_labels=len(self.labels),
            output_attentions=False,
            output_hidden_states=False
        )
        model.to(device)

        # Load and apply the fine-tuned weights
        model.load_state_dict(
            torch.load(config.MODEL_WEIGHT_LOCAL_COPY, map_location=device)
        )

        # The model is only used for inference: make sure dropout is off
        model.eval()

        return model

    def predict(self, dataset: BertTokenizer) -> Intent:
        """
        Make a prediction and return the class.

        :param dataset: Object exposing `input_ids` and `attention_mask`
            (presumably the output of the tokenizer - confirm with caller).
        :return: Intent built from the label of the highest-scoring class of
            the first input.
        """

        # Inference only: no gradient bookkeeping needed
        with torch.no_grad():
            output = self.model(
                input_ids=dataset.input_ids,
                token_type_ids=None,
                attention_mask=dataset.attention_mask
            )

        # First element of the output holds one score per class
        _, predicted_index = torch.max(output[0], dim=1)

        # Return the intent
        return Intent(self.labels[predicted_index[0].item()])
class Filter:
    """
    Filter object, storing values of a filter. Used in the Matcher
    class in a RegEx to extract keywords from a given text.
    """

    def __init__(self, name: str, words: List[str], regex: str, threshold: float = 0.95):
        """
        :param name: Name of the filter.
        :param words: Words the filter is matched against.
        :param regex: Regular-expression template; cleaned by set_regex.
        :param threshold: Minimum similarity for a word to be accepted.
        """

        self.name = name
        self.words = words
        self.regex = self.set_regex(regex)
        self.threshold = threshold

    @staticmethod
    def set_regex(regex: str) -> str:
        """
        Clean the regex string: collapse every double backslash, due to TOML
        file formatting, into a single one, and return the result.
        """

        # str.replace returns a NEW string (strings are immutable): the
        # result has to be returned, not discarded.
        return regex.replace('\\\\', '\\')
def get_keywords(self, text: str, intent: str) -> dict:
    """
    Extract, for every filter registered for an intent, the value attached
    to the best-matching word of the text.

    :param text: Text to search.
    :param intent: Name of the intent whose filters are applied.
    :return: {filter name: extracted value or None}; an empty dictionary
        when no filter is registered for the intent.
    """

    keywords = {}
    if intent not in self.filters:
        return keywords

    # Split the text into a list of words (no empty entries)
    entries = text.split()

    for filter_ in self.filters[intent]:

        # None unless a value is found below
        keywords[filter_.name] = None

        # Nothing to match against in an empty text
        if not entries:
            continue

        # Match similarities between the filter and the given text
        self.model.match(entries, filter_.words)
        matches: pd.DataFrame = self.model.get_matches()

        try:
            # Keep the words similar enough, then take the most similar one
            # (first column of the first best row)
            candidates = matches[matches['Similarity'] >= filter_.threshold]
            if candidates.empty:
                continue

            best = candidates[candidates['Similarity'] == candidates['Similarity'].max()]
            keyword = best.iloc[0, 0]

        except (IndexError, KeyError, TypeError):
            # No usable match table: leave the filter as None
            continue

        # The keyword originates from the user's text: escape it so that
        # characters such as "(" or "?" are matched literally instead of
        # being interpreted (or rejected) as regex syntax.
        pattern = filter_.regex % re.escape(str(keyword))

        # Use the keyword to retrieve and save its chained-data
        result = re.search(pattern, text)
        if result:
            keywords[filter_.name] = result.group(filter_.name)

    return keywords
class NLU:
    """
    Entry point of the language-understanding pipeline: it owns a
    Preprocessor, a Tokenizer, a Classifier and a Matcher, and chains
    them to turn a raw message into an intent and its keywords.
    """

    def __init__(self):

        # Text preparation components
        self.preprocessor = Preprocessor()
        self.tokenizer = Tokenizer()

        # Intent prediction and keyword extraction components
        self.classifier = Classifier()
        self.matcher = Matcher()

    def get_intent(self, message: str) -> (Intent, dict):
        """
        Return the intention and the keywords of a given message.

        :param message: Raw message to analyse.
        :return: A tuple (intent, keywords), where keywords is the
            dictionary produced by the matcher for ``intent.value``.
        """

        # Clean the message once; both the tokenizer and the matcher use it
        cleaned = self.preprocessor.preprocess(message)

        # Predict the intention from the tokenized message
        intent = self.classifier.predict(self.tokenizer.get_dataset(cleaned))

        # Extract the keywords with the filters of the predicted intention
        return intent, self.matcher.get_keywords(cleaned, intent.value)
def preprocess(self, text: str) -> str:
    """
    Apply a preprocess pipeline to a given text.

    The text is stripped from its html tags, its contractions are
    expanded, its accented characters are replaced, it is limited to the
    default number of words and lowercased. It is then parsed with
    ``self.nlp``: number words are converted to numbers and the other
    tokens are replaced by their lemma.

    :param text: Raw text to clean.
    :return: The cleaned tokens, joined with single spaces.
    """

    # Apply all preformatting
    text = self.strip_html_tags(text)
    text = self.expand_contractions(text)
    text = self.remove_accented_chars(text)
    # NOTE(review): second contraction pass kept as-is. It may have been
    # meant to be remove_whitespace(), which is never called; the
    # split/join in limit_n_words() already collapses whitespace - confirm.
    text = self.expand_contractions(text)
    text = self.limit_n_words(text)
    text = text.lower()

    # Tokenize the text
    document = self.nlp(text)
    clean_text = []

    for token in document:

        # Convert number words to numeric numbers
        if token.pos_ == 'NUM':
            try:
                clean_text.append(w2n.word_to_num(token.text))
            except ValueError:
                # Tokens tagged NUM that are not number words (e.g. "7:00")
                # make word_to_num raise: keep them untouched instead of
                # crashing the whole pipeline.
                clean_text.append(token.text)

        # Convert tokens to base form
        elif token.lemma_ != "-PRON-":
            clean_text.append(token.lemma_)

        # Append the token if no modification was applied
        else:
            clean_text.append(token)

    return ' '.join(map(str, clean_text))