FROM python:3.8-slim-buster

# Install the security updates and the compiler required to build some of the
# Python packages. Everything runs in ONE layer on purpose: files removed in a
# later RUN instruction still take up space in the earlier layers, so the apt
# cache cleanup only shrinks the image when it shares the layer that created
# the cache.
RUN apt-get update \
    && apt-get -y upgrade \
    && apt-get -y install gcc \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

EXPOSE 3978

WORKDIR /opt/app

# Install the app libraries before copying the sources, so that editing the
# code does not invalidate the (slow) dependency layers.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Install the small SpaCy model.
RUN python -m spacy download en_core_web_sm

# Copy the application.
COPY . /opt/app

# Start the app.
ENTRYPOINT [ "python" ]
CMD [ "main.py" ]
\ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..63a9fee --- /dev/null +++ b/README.md @@ -0,0 +1,80 @@ +# chatbot + + + + + +## Cross-plateforme implementation + + +## fonctionnalités du Bot : + +### Traitement des inputs "utilisateurs" + +- Bert ? + +### Possible réponses + +- Accueillire + - Décrire les fonctionnalités du bot + - Répondre +- Donner les heures d'ouvertures +- Afficher une liste d'objet, de produit + +### Nice to have + +- Réserver un service / un produit + - Gérer un agenda + - vérifier la disponibilité + - +- + + "**In English**, the bot should be able to :\n", + "\n", + "- Understand phrases related to a room reservation.\n", + "Example that the bot will have to understand: \n", + "\n", + "\t\t> I wish to reserve a room for 2 people.\n", + "\t\t> I wish to reserve a room for 4 days\n", + "\t\t> Do you have rooms available from July 23rd?\n", + "\t\t> I would like to reserve a room for two days and for two people\n", + "\n", + "- Understand phrases related to a table reservation for the restaurant. \n", + "\n", + "\t\t> I would like to make a reservation for tonight.\n", + "\t\t> I'd like to reserve a table for four people.\n", + "\n", + "- Must ensure a continuous and ongoing conversation. Example of a complete conversation : \n", + "\n", + "\t\t> Customer : Hello !\n", + "\t\t> Bot : Hello, how can I help you? \n", + "\t\t> Customer: I would like to reserve a table for 4 people ? \n", + "\t\t> Bot : For which date would you like to reserve your table?\n", + "\t\t> Customer : Today at 7:00 pm\n", + "\t\t> Bot : What name should I make the reservation under?\n", + "\t\t> Customer : My name is Mr. Dupont! \n", + "\t\t> Bot : Very well Mr Dupont, I confirm you the reservation of a table for 4 people tonight at 7:00 pm. \n", + "\t\t> Bot : Can I help you with something else?\n", + "\t\t> Customer : No thanks\n", + "\t\t> Bot: Have a nice day. \n", + "\n", + "- Understand when the client is angry. 
In this case, the bot will indicate that it is transmitting the conversation to a human. \n", + "\n", + "\t\t> You're incompetent!\n", + "\t\t> My room is dirty! This is outrageous!\n", + "\t\t> I want to talk to a human. \n", + "\n", + "### Nice-to-have features\n", + "- Create an API of your bot to make it cross-platform \n", + "- Use Docker\n", + + + + +## Hébergement du Bot + +Timeline: +- Etablir l'objectif (déployer bot cross-plateforme + créer propre modele) +- Trouver un framework >> MSBotFramework +- Créer un dataset +- Deployer dummy bot \ No newline at end of file diff --git a/assets/conversation_simple.svg b/assets/conversation_simple.svg new file mode 100644 index 0000000..4bc7af6 --- /dev/null +++ b/assets/conversation_simple.svg @@ -0,0 +1,3 @@ + + +
Bonjour, je voudrais réserver une chambre
Bonjour, je voudrais réserver une chambre
Bonjour, combien de personnes ?
Bonjour, combien de personnes ?
3
3
Combien de nuits ?
Combien de nuits ?
2
2
Réservation effectuée, bonne journée !
Réservation effectuée, bonne journée !
Viewer does not support full SVG 1.1
class Config:
    """
    Bot configuration class.

    Every value is computed once, when the class body is executed (i.e. at
    import time), from environment variables or hard-coded defaults.
    """

    # Deployment: TCP port the web server listens on
    PORT = int(environ.get("PORT", 3978))

    # Azure deployment: credentials handed to the Bot Framework adapter
    APP_ID = environ.get("MS_APP_ID", "")
    APP_PASSWORD = environ.get("MS_APP_PASSWORD", "")

    # Models
    MODEL_PREPROCESS = "en_core_web_sm"     # SpaCy smallest model - For preprocess
    MODEL_MATCHING = "TF-IDF"               # PolyFuzz lightest model - Optimized for matching
    MODEL_CLASSIFIER = "bert-base-uncased"  # HuggingFace smallest BERT model - For tokenization and classifying

    # Remote files.
    # The trailing slash is stripped so that "https://host/bucket/" and
    # "https://host/bucket" both produce a single "/" before the file name.
    s3_base_url = environ.get("S3_BASE_URL", "").rstrip("/")

    weight_file = "resa_BERT_model.pt"
    MODEL_WEIGHT_URL = f"{s3_base_url}/{weight_file}"  # Fine-tuned weights for BERT model
    MODEL_WEIGHT_LOCAL_COPY = f"./assets/model/{weight_file}"

    classes_file = "labels.pickle"
    MODEL_CLASSES_URL = f"{s3_base_url}/{classes_file}"
    MODEL_CLASSES_LOCAL_COPY = f"./assets/model/{classes_file}"

    # Filters
    FILTERS_TOML = "./filters.toml"
+ [longtalk_make_reservation.people] + words = ["pearson", "people"] + regex = '''(?P\d)\W%s''' + threshold = 0.85 + + # Duration of the book: How long ? + [longtalk_make_reservation.duration] + words = ["day", "night"] + regex = '''(?P\d)\W%s''' + threshold = 0.85 diff --git a/main.py b/main.py new file mode 100644 index 0000000..6a24ddc --- /dev/null +++ b/main.py @@ -0,0 +1,114 @@ + +import sys +import traceback +from datetime import datetime +from http import HTTPStatus + +from aiohttp import web +from aiohttp.web import Request, Response, json_response +from botbuilder.core import BotFrameworkAdapterSettings, TurnContext, BotFrameworkAdapter, ConversationState, MemoryStorage, UserState +from botbuilder.core.integration import aiohttp_error_middleware +from botbuilder.schema import Activity, ActivityTypes + +from src.dialogs import MainDialog, BookingRoomDialog +from src.nlu import NLU +from src import Bot +from config import Config + +# Load the config and create the bot +config = Config() + +# Init a Bot adapter https://aka.ms/about-bot-adapter +settings = BotFrameworkAdapterSettings(config.APP_ID, config.APP_PASSWORD) +ADAPTER = BotFrameworkAdapter(settings) + + +# Catch-all for errors +async def on_error(context: TurnContext, error_: Exception): + """ + Catch-all functions to write out errors on console log. + NOTE: In production environment, logging should be done + to Azure application insights. + """ + + # Print the error into the logs + print(f"\n [on_turn_error] unhandled error: {error_}", file=sys.stderr) + traceback.print_exc() + + # Send a message to the user + await context.send_activity("The bot encountered an error or bug.") + + # If the bot is run from the Bot Framework Emulator (dev environment), + # print a more complete error log. 
async def messages(req: Request) -> Response:
    """
    Main bot function: Listen for incoming API request.
    Route: '/api/messages'.

    Return HTTP-415 when the request is not declared as JSON, the status and
    body produced by the adapter when it returns a response, and HTTP-200
    otherwise.
    """

    # Filter only JSON requests. `.get` is used because the header may be
    # absent altogether: indexing the headers would then raise KeyError
    # instead of answering with a clean 415.
    if "application/json" not in req.headers.get("Content-Type", ""):
        return Response(status=HTTPStatus.UNSUPPORTED_MEDIA_TYPE)

    body = await req.json()

    # Deserialize the JSON
    activity = Activity().deserialize(body)

    # Retrieve the authorization code if sent (empty string otherwise)
    auth_header = req.headers.get("Authorization", "")

    # Call the bot and send back its response
    response = await ADAPTER.process_activity(activity, auth_header, bot.on_turn)
    if response:
        return json_response(data=response.body, status=response.status)

    # Return HTTP-200 if no response is sent back
    return Response(status=HTTPStatus.OK)
class Bot(ActivityHandler):
    """
    Activity handler of the chatbot: greets the members joining the
    conversation, forwards every message to the configured dialog and saves
    the conversation/user states at the end of each turn.
    """

    def __init__(self, conversation_state: ConversationState, user_state: UserState, dialog: Dialog):
        """Keep a reference to the two state objects and to the dialog to run."""

        self.conversation_state = conversation_state
        self.user_state = user_state
        self.dialog = dialog

    async def on_members_added_activity(self, members_added: [ChannelAccount], turn_context: TurnContext):
        """Send a greeting for each added member whose id differs from the activity recipient's."""

        for account in members_added:

            # Skip the member whose id is the recipient of the activity
            if account.id == turn_context.activity.recipient.id:
                continue

            await turn_context.send_activity(f"Hello {Emoji.WAVING_HAND.value}")

    async def on_turn(self, turn_context: TurnContext):
        """Run the regular activity dispatching, then save both states."""

        await super().on_turn(turn_context)

        # Whatever happened during the turn, write the state changes back:
        # conversation state first, user state second.
        await self.conversation_state.save_changes(turn_context)
        await self.user_state.save_changes(turn_context)

    async def on_message_activity(self, turn_context: TurnContext):
        """Run the dialog for this message, using the "DialogState" property of the conversation state."""

        dialog_state = self.conversation_state.create_property("DialogState")
        await DialogHelper.run_dialog(self.dialog, turn_context, dialog_state)
The + UserState isn't, however, it might have been used in a Dialog implementation, and the requirement is that all + BotState objects are saved at the end of a turn. + """ + + def __init__( + self, + conversation_state: ConversationState, + user_state: UserState, + dialog: Dialog, + ): + if conversation_state is None: + raise TypeError( + "[DialogBot]: Missing parameter. conversation_state is required but None was given" + ) + if user_state is None: + raise TypeError( + "[DialogBot]: Missing parameter. user_state is required but None was given" + ) + if dialog is None: + raise Exception("[DialogBot]: Missing parameter. dialog is required") + + self.conversation_state = conversation_state + self.user_state = user_state + self.dialog = dialog + + async def on_turn(self, turn_context: TurnContext): + await super().on_turn(turn_context) + + # Save any state changes that might have ocurred during the turn. + await self.conversation_state.save_changes(turn_context) + await self.user_state.save_changes(turn_context) + + async def on_message_activity(self, turn_context: TurnContext): + await DialogHelper.run_dialog( + self.dialog, + turn_context, + self.conversation_state.create_property("DialogState"), + ) \ No newline at end of file diff --git a/src/dialogs/__init__.py b/src/dialogs/__init__.py new file mode 100644 index 0000000..a928908 --- /dev/null +++ b/src/dialogs/__init__.py @@ -0,0 +1,5 @@ + +from .booking_room_dialog import BookingRoomDialog +from .main_dialog import MainDialog + +__all__ = ["BookingRoomDialog", "MainDialog"] diff --git a/src/dialogs/booking_room_dialog.py b/src/dialogs/booking_room_dialog.py new file mode 100644 index 0000000..d24867e --- /dev/null +++ b/src/dialogs/booking_room_dialog.py @@ -0,0 +1,179 @@ + +from botbuilder.schema import ChannelAccount, CardAction, ActionTypes, SuggestedActions, Activity, ActivityTypes +from botbuilder.dialogs import ComponentDialog, WaterfallDialog, WaterfallStepContext, DialogTurnResult +from 
class BookingRoomDialog(ComponentDialog):
    """
    Waterfall dialog booking a hotel room.

    Steps: number of people -> number of nights -> breakfast -> summary.
    The dialog options are expected to be the keywords dictionary passed to
    `begin_dialog` by the caller; a question whose answer is already present
    (and valid) in these keywords is skipped.
    """

    # Inclusive bounds, shared by the prompt validators and by the check
    # applied to the values pre-filled from the keywords.
    _PEOPLE_RANGE = (1, 4)
    _DURATION_RANGE = (1, 7)

    def __init__(self, nlu_recognizer: NLU, user_state: UserState):
        super(BookingRoomDialog, self).__init__(BookingRoomDialog.__name__)

        # Load the NLU module
        self._nlu_recognizer = nlu_recognizer

        # Accessor of the "RoomReservation" property in the user state
        self.room_reservation_accessor = user_state.create_property("RoomReservation")

        # Setup the waterfall dialog
        self.add_dialog(WaterfallDialog("WFBookingDialog", [
            self.people_step,
            self.duration_step,
            self.breakfast_step,
            self.summary_step,
        ]))

        # Append the prompts and custom prompts
        self.add_dialog(NumberPrompt("PeoplePrompt", BookingRoomDialog.people_prompt_validator))
        self.add_dialog(NumberPrompt("DurationPrompt", BookingRoomDialog.duration_prompt_validator))
        self.add_dialog(ConfirmPrompt("IsTakingBreakfastPrompt"))

        self.initial_dialog_id = "WFBookingDialog"

    @staticmethod
    def _prefilled_number(keywords: dict, name: str, bounds: tuple):
        """
        Return `keywords[name]` as an int when it lies within `bounds`
        (inclusive), None otherwise.

        Pre-filled values do not go through the prompt validators, so the
        same restriction is enforced here. A missing, None, non-numeric or
        out-of-range value yields None: the user is then asked the question.
        """

        raw_value = keywords.get(name)
        if raw_value is None:
            return None

        try:
            number = int(raw_value)
        except (TypeError, ValueError):
            return None

        lowest, highest = bounds
        return number if lowest <= number <= highest else None

    @staticmethod
    async def people_step(step_context: WaterfallStepContext) -> DialogTurnResult:
        """Ask the user: how many people to make the reservation?"""

        # Retrieve the booking keywords. The dialog may be started without
        # options (or with something else than a dictionary): fall back to
        # "nothing pre-filled" rather than failing on the lookups below.
        options = step_context.options
        booking_keywords: dict = options if isinstance(options, dict) else {}
        step_context.values['booking_keywords'] = booking_keywords

        # If the keyword 'people' exists and is valid, pass the question
        people = BookingRoomDialog._prefilled_number(
            booking_keywords, 'people', BookingRoomDialog._PEOPLE_RANGE
        )
        if people is not None:
            return await step_context.next(people)

        # Give user suggestions (1 or 2 people).
        # The user can still write a custom number of people [1, 4].
        options = PromptOptions(
            prompt=Activity(

                type=ActivityTypes.message,
                text="Would you like a single or a double room?",

                suggested_actions=SuggestedActions(
                    actions=[
                        CardAction(
                            title="Single",
                            type=ActionTypes.im_back,
                            value="Single room (1 people)"
                        ),
                        CardAction(
                            title="Double",
                            type=ActionTypes.im_back,
                            value="Double room (2 peoples)"
                        )
                    ]
                )
            ),
            retry_prompt=MessageFactory.text(
                "Reservations can be made for one to four people only."
            )
        )

        # NumberPrompt - How many people ?
        return await step_context.prompt(
            "PeoplePrompt",
            options
        )

    @staticmethod
    async def duration_step(step_context: WaterfallStepContext) -> DialogTurnResult:
        """Ask the user: how many night to reserve?"""

        # Save the number of people
        step_context.values["people"] = step_context.result

        # Retrieve the keywords
        booking_keywords: dict = step_context.values["booking_keywords"]

        # If the keyword 'duration' exists and is valid, pass the question
        duration = BookingRoomDialog._prefilled_number(
            booking_keywords, 'duration', BookingRoomDialog._DURATION_RANGE
        )
        if duration is not None:
            return await step_context.next(duration)

        # NumberPrompt - How many nights ? (duration)
        return await step_context.prompt(
            "DurationPrompt",
            PromptOptions(
                prompt=MessageFactory.text("How long do you want to stay?"),
                retry_prompt=MessageFactory.text(
                    "It is only possible to book from 1 to 7 nights"
                ),
            ),
        )

    @staticmethod
    async def breakfast_step(step_context: WaterfallStepContext) -> DialogTurnResult:
        """Recap people and duration, then ask the user: breakfast or not?"""

        # Save the number of nights
        step_context.values["duration"] = step_context.result

        # Confirm people and duration
        await step_context.context.send_activity(
            MessageFactory.text(
                f"Okay, so {step_context.values['people']} people for {step_context.values['duration']} nights"
            )
        )

        # ConfirmPrompt - Is taking breakfast ?
        return await step_context.prompt(
            "IsTakingBreakfastPrompt",
            PromptOptions(
                prompt=MessageFactory.text("Will you be having breakfast?")
            ),
        )

    async def summary_step(self, step_context: WaterfallStepContext) -> DialogTurnResult:
        """Store the collected answers in the RoomReservation and end the dialog."""

        # Save if the user take the breakfast (bool)
        step_context.values["breakfast"] = step_context.result

        # If the user said "Yes":
        if step_context.result:

            # Confirm breakfast hour
            await step_context.context.send_activity(
                MessageFactory.text("Perfect, breakfast is from 6am to 10am")
            )

        # Save information to Reservation object. RoomReservation is handed
        # over as the default to use when the property has no value yet.
        room_reservation = await self.room_reservation_accessor.get(
            step_context.context, RoomReservation
        )

        room_reservation.people = step_context.values["people"]
        room_reservation.duration = step_context.values["duration"]
        room_reservation.breakfast = step_context.values["breakfast"]

        # End the dialog
        await step_context.context.send_activity(
            MessageFactory.text("Your booking has been made !")
        )

        return await step_context.end_dialog()

    @staticmethod
    async def people_prompt_validator(prompt_context: PromptValidatorContext) -> bool:
        """Validate the number of people entered by the user."""

        # Restrict people between [1 and 4].
        lowest, highest = BookingRoomDialog._PEOPLE_RANGE
        return (
            prompt_context.recognized.succeeded
            and lowest <= prompt_context.recognized.value <= highest
        )

    @staticmethod
    async def duration_prompt_validator(prompt_context: PromptValidatorContext) -> bool:
        """Validate the number of nights entered by the user."""

        # Restrict nights between [1 and 7].
        lowest, highest = BookingRoomDialog._DURATION_RANGE
        return (
            prompt_context.recognized.succeeded
            and lowest <= prompt_context.recognized.value <= highest
        )
class NLUHelper:
    """Helper exposing the NLU recognizer to the dialogs through a coroutine."""

    @staticmethod
    async def execute_nlu_query(nlu_recognizer: NLU, message: str) -> (Intent, dict):
        """
        Query the recognizer with the given message and hand back, untouched,
        the (intent, keywords) pair returned by its `get_intent` method.

        NOTE: `get_intent` is called synchronously; nothing is awaited here.
        """

        intent_and_keywords = nlu_recognizer.get_intent(message)
        return intent_and_keywords
async def act_step(self, step_context: WaterfallStepContext) -> DialogTurnResult:
    """
    Act step. Infer the intention of the user from the answer to the previous
    prompt, then dispatch: a room booking intent starts the booking
    sub-dialog (with the keywords found by the NLU as options), anything else
    gets an apology and moves on to the next step.
    """

    user_answer = step_context.result
    intent, keywords = await NLUHelper.execute_nlu_query(self._nlu_recognizer, user_answer)

    # Nothing usable was understood: say so, then continue the waterfall
    if intent != Intent.BOOK_ROOM:
        apology = "Sorry, I didn't get that. Please try asking in a different way"
        apology_activity = MessageFactory.text(apology, apology, InputHints.ignoring_input)

        await step_context.context.send_activity(apology_activity)
        return await step_context.next(None)

    # Run the BookingRoomDialog, passing it keywords from nlu
    return await step_context.begin_dialog(self._booking_dialog_id, keywords)
class Classifier:
    """
    Intent classifier built on a BERT sequence-classification model.

    On instantiation, the labels and the fine-tuned weights are downloaded
    from the URLs of the configuration and saved locally; `predict` then maps
    an encoded text to an Intent.
    """

    def __init__(self):

        # Load the classes and the model (labels first: the model needs
        # their count)
        self.labels = self._load_labels()
        self.model = self._load_model()

    @staticmethod
    def __load_remote_file(url: str, local: str):
        """
        Stream the content found at `url` into the file `local`.

        Raises requests.HTTPError when the server answers with an error
        status. The check runs before the local file is opened, so a failed
        download neither truncates a previous copy nor stores an error page
        in place of the expected file.
        """

        # Open the URL and a local file
        with requests.get(url, stream=True) as response:
            response.raise_for_status()

            with open(local, 'wb') as handle:

                # Stream the content to the local file
                for chunk in response.iter_content(chunk_size=8192):
                    handle.write(chunk)

    def _load_labels(self) -> dict:
        """
        Load the dictionary labels from a remote pickle file and return it.
        """

        # Download and save the pickle locally
        self.__load_remote_file(config.MODEL_CLASSES_URL, config.MODEL_CLASSES_LOCAL_COPY)

        # Load and return a dictionary.
        # NOTE(security): unpickling can execute arbitrary code. This is only
        # acceptable as long as the remote location is fully trusted.
        with open(config.MODEL_CLASSES_LOCAL_COPY, 'rb') as handle:
            return pickle.load(handle)

    def _load_model(self) -> BertForSequenceClassification:
        """
        Load the weight of the model from a remote file (around 500 Mo),
        instantiate and return the model.
        """

        # Download and save the weights locally
        self.__load_remote_file(config.MODEL_WEIGHT_URL, config.MODEL_WEIGHT_LOCAL_COPY)

        # Instantiate the model, with one output per known label
        model = BertForSequenceClassification.from_pretrained(
            config.MODEL_CLASSIFIER,
            num_labels=len(self.labels),
            output_attentions=False,
            output_hidden_states=False
        )
        model.to(device)

        # Load and append the weights.
        # NOTE(security): torch.load relies on pickle too - same trust
        # requirement as for the labels.
        model.load_state_dict(
            torch.load(config.MODEL_WEIGHT_LOCAL_COPY, map_location=device)
        )

        return model

    def predict(self, dataset: BertTokenizer) -> Intent:
        """
        Make a prediction and return the class.

        `dataset` must expose `input_ids` and `attention_mask`. Only the
        first sequence of the batch is classified.
        """

        # Inference only: skip the gradient bookkeeping
        with torch.no_grad():

            # Make the prediction, get the scores of each class
            outputs = self.model(
                input_ids=dataset.input_ids,
                token_type_ids=None,
                attention_mask=dataset.attention_mask
            )

        # Get the predicted class index
        _, predicted_index = torch.max(outputs[0], dim=1)

        # Return the intent
        return Intent(self.labels[predicted_index[0].item()])
class Filter:
    """
    Filter object, storing values of a filter. Used in the Matcher
    class in a RegEx to extract keywords from a given text.
    """

    def __init__(self, name: str, words: List[str], regex: str, threshold: float = 0.95):
        """
        :param name: Name of the filter.
        :param words: Words attached to the filter.
        :param regex: Regex template of the filter, cleaned by `set_regex`.
        :param threshold: Similarity threshold of the filter. Default: 0.95.
        """

        self.name = name
        self.words = words
        self.regex = self.set_regex(regex)
        self.threshold = threshold

    @staticmethod
    def set_regex(regex: str) -> str:
        """
        Clean the regex string: return it with every double backslash
        (due to TOML file formatting) replaced by a single one.
        """

        # str.replace returns a NEW string (strings are immutable): the
        # result has to be returned, not discarded.
        return regex.replace('\\\\', '\\')
def get_keywords(self, text: str, intent: str) -> dict:
    """
    Extract, for each filter of the given intent, the data chained to the
    word of the text that best matches the filter.

    :param text: Text to search, split on single spaces for the matching.
    :param intent: Key of `self.filters`; an unknown intent gives `{}`.
    :return: One entry per filter name: the content of the regex group
        named after the filter, or None when no word reaches the threshold
        of the filter or when the regex does not match.
    """

    keywords = {}

    # Split the text into a list of words (once: it does not depend on the filter)
    entries = text.split(" ")

    for filter_ in self.filters.get(intent, []):

        # Unless proven otherwise below, nothing was found for this filter
        keywords[filter_.name] = None

        # Match similarities between the filter and the given text
        self.model.match(entries, filter_.words)
        matches = self.model.get_matches()

        # Keep the rows reaching the threshold. "No match" is an expected
        # outcome: test for it instead of catching every exception, which
        # would hide real errors as well.
        candidates = matches[matches['Similarity'] >= filter_.threshold]
        if candidates.empty:
            continue

        # Get the word with the maximum similarity (first column of the
        # first best row)
        best = candidates[candidates['Similarity'] == candidates['Similarity'].max()]
        keyword = best.iloc[0, 0]

        # Use the keyword to retrieve and save its chained-data. The keyword
        # comes from the text: escape it so it is always searched literally,
        # whatever characters it contains.
        result = re.search(filter_.regex % re.escape(str(keyword)), text)
        if result:
            keywords[filter_.name] = result.group(filter_.name)

    return keywords
import Intent +from .matching import Matcher +from .preprocessing import Preprocessor, Tokenizer +from .classifying import Classifier + + +class NLU: + + def __init__(self): + + # Preprocessing + self.preprocessor = Preprocessor() + self.tokenizer = Tokenizer() + + # Classifier + self.classifier = Classifier() + self.matcher = Matcher() + + def get_intent(self, message: str) -> (Intent, dict): + """ + Return the intention and the keywords of a given message. + """ + + # Clean the message and create a dataset of tokens + preprocessed_text = self.preprocessor.preprocess(message) + dataset = self.tokenizer.get_dataset(preprocessed_text) + + # Get the intention + intent = self.classifier.predict(dataset) + keywords = self.matcher.get_keywords(preprocessed_text, intent.value) + + return intent, keywords diff --git a/src/nlu/preprocessing/__init__.py b/src/nlu/preprocessing/__init__.py new file mode 100644 index 0000000..59813be --- /dev/null +++ b/src/nlu/preprocessing/__init__.py @@ -0,0 +1,5 @@ + +from .tokenizer import Tokenizer +from .preprocessor import Preprocessor + +__all__ = ["Tokenizer", "Preprocessor"] diff --git a/src/nlu/preprocessing/preprocessor.py b/src/nlu/preprocessing/preprocessor.py new file mode 100644 index 0000000..593f429 --- /dev/null +++ b/src/nlu/preprocessing/preprocessor.py @@ -0,0 +1,81 @@ + +from bs4 import BeautifulSoup +import spacy +import unidecode +from word2number import w2n +import contractions + +from config import Config +config = Config() + + +class Preprocessor: + + def __init__(self): + + # Load SpaCy model for preprocess. Default: en_core_web_sm + self.nlp = spacy.load(config.MODEL_PREPROCESS) + + @staticmethod + def strip_html_tags(text: str) -> str: + """Remove html tags from the document.""" + + soup = BeautifulSoup(text, "html.parser") + return soup.get_text(separator=" ") + + @staticmethod + def expand_contractions(text: str) -> str: + """Expand shortened words, e.g. 
'don't' to 'do not'.""" + + return contractions.fix(text) + + @staticmethod + def remove_accented_chars(text: str) -> str: + """Remove accented characters from text, e.g. café.""" + + return unidecode.unidecode(text) + + @staticmethod + def remove_whitespace(text: str) -> str: + """Remove extra whitespaces from text.""" + + text = text.strip() + return " ".join(text.split()) + + @staticmethod + def limit_n_words(text: str, limit: int = 256): + """Limit a text to n-words. Default: 256.""" + + text = text.split()[:limit] + return " ".join(text) + + def preprocess(self, text: str) -> str: + """Apply a preprocess pipeline to a given text.""" + + # Apply all preformatting + text = self.strip_html_tags(text) + text = self.expand_contractions(text) + text = self.remove_accented_chars(text) + text = self.expand_contractions(text) + text = self.limit_n_words(text) + text = text.lower() + + # Tokenize the text + document = self.nlp(text) + clean_text = [] + + for token in document: + + # Convert number words to numeric numbers + if token.pos_ == 'NUM': + clean_text.append(w2n.word_to_num(token.text)) + + # Convert tokens to base form + elif token.lemma_ != "-PRON-": + clean_text.append(token.lemma_) + + # Append the token if no modification was applied + else: + clean_text.append(token) + + return ' '.join(map(str, clean_text)) diff --git a/src/nlu/preprocessing/tokenizer.py b/src/nlu/preprocessing/tokenizer.py new file mode 100644 index 0000000..d9b8909 --- /dev/null +++ b/src/nlu/preprocessing/tokenizer.py @@ -0,0 +1,17 @@ + +from transformers import BertTokenizer + +from config import Config +config = Config() + + +class Tokenizer: + + def __init__(self): + self.tokenizer = BertTokenizer.from_pretrained(config.MODEL_CLASSIFIER) + + def get_dataset(self, text: str): + """Return a torch Dataset from a given text.""" + + # Tokenize the text and return it + return self.tokenizer(text, return_tensors="pt")