Source code for mindmeld.converter.dialogflow

# -*- coding: utf-8 -*-
#
# Copyright (c) 2015 Cisco Systems, Inc. and others.  All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#     http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""This module contains the DialogflowConverter class used to convert Dialogflow projects
into MindMeld projects"""

import json
import logging
import os
import re
import importlib.util

from shutil import copyfile
from mindmeld.converter.converter import Converter
from mindmeld.converter.code_generator import MindmeldCodeGenerator
from mindmeld.components._config import DEFAULT_INTENT_CLASSIFIER_CONFIG

logger = logging.getLogger(__name__)
package_dir = os.path.dirname(os.path.abspath(__file__))


[docs]class DialogflowConverter(Converter):
    """The class is a sub class of the abstract Converter class. This class
    contains the methods required to convert a Dialogflow project into a MindMeld project
    """

    # Maps a Dialogflow system entity type to the MindMeld system entity type
    # used in its place. Read by _create_intent_file when labeling training
    # queries and by generate_handlers when resolving required parameters.
    sys_entity_map = {
        "@sys.date-time": "sys_interval",
        "@sys.date": "sys_time",
        "@sys.date-period": "sys_interval",
        "@sys.time": "sys_time",
        "@sys.time-period": "sys_duration",
        "@sys.duration": "sys_duration",
        "@sys.number": "sys_number",
        "@sys.cardinal": "sys_number",
        "@sys.ordinal": "sys_ordinal",
        "@sys.unit-currency": "sys_amount-of-money",
        "@sys.unit-volume": "sys_volume",
        "@sys.email": "sys_email",
        "@sys.phone-number": "sys_phone-number",
        "@sys.url": "sys_url",
        "@sys.temperature": "sys_temperature",
    }

    # TODO: provide support for entities listed in sys_entity_map_todo
    # Dialogflow system entities that have no entry in sys_entity_map yet.
    # No method of this class reads this list; it only documents the gap.
    sys_entity_map_todo = [
        "@sys.number-integer",
        "@sys.number-sequence",
        "@sys.flight-number",
        "@sys.unit-area",
        "@sys.unit-length",
        "@sys.unit-speed",
        "@sys.unit-information",
        "@sys.percentage",
        "@sys.age",
        "@sys.currency-name",
        "@sys.unit-area-name",
        "@sys.unit-length-name",
        "@sys.unit-speed-name",
        "@sys.unit-volume-name",
        "@sys.unit-weight-name",
        "@sys.unit-information-name",
        "@sys.address",
        "@sys.zip-code",
        "@sys.geo-capital",
        "@sys.geo-country",
        "@sys.geo-country-code",
        "@sys.geo-city",
        "@sys.geo-state",
        "@sys.geo-city",  # NOTE(review): duplicate of the entry two lines above
        "@sys.geo-state",  # NOTE(review): duplicate of the entry two lines above
        "@sys.place-attraction",
        "@sys.airport",
        "@sys.location",
        "@sys.given-name",
        "@sys.last-name",
        "@sys.person",
        "@sys.music-artist",
        "@sys.music-genre",
        "@sys.color",
        "@sys.language",
        "@sys.any",
    ]

    def __init__(
        self,
        dialogflow_project_directory,
        mindmeld_project_directory,
        custom_config_file_path=None,
        language="en",
    ):
        if os.path.exists(os.path.dirname(dialogflow_project_directory)):
            self.dialogflow_project_directory = dialogflow_project_directory
            self.mindmeld_project_directory = mindmeld_project_directory
            self.directory = os.path.dirname(os.path.realpath(__file__))
            self.entities_list = set()
            self.intents_list = set()
            self.code_gen = MindmeldCodeGenerator()
            self.custom_config_file_path = custom_config_file_path
            self.language = language
        else:
            msg = "`{dialogflow_project_directory}` does not exist. Please verify."
            msg = msg.format(dialogflow_project_directory=dialogflow_project_directory)
            raise FileNotFoundError(msg)

[docs]    def create_mindmeld_directory(self):
        self.create_directory(self.mindmeld_project_directory)
        self.create_directory(os.path.join(self.mindmeld_project_directory, "data"))
        self.create_directory(os.path.join(self.mindmeld_project_directory, "domains"))
        self.create_directory(
            os.path.join(self.mindmeld_project_directory, "domains", "app_specific")
        )
        self.create_directory(
            os.path.join(self.mindmeld_project_directory, "domains", "unrelated")
        )
        self.create_directory(os.path.join(self.mindmeld_project_directory, "entities"))

    # =========================
    # create training data (entities, intents)
    # =========================

    def _create_entities_directories(self, entities):
        """Create a MindMeld entity folder for every Dialogflow entity in ``self.language``.

        Currently does not use the meta data in the entityName.json files
        (the keys of ``entities``).

        Args:
            entities (dict): Mapping as returned by ``_get_file_names("entities")``,
                i.e. ``{entity_name: {language: entries_file_stem}}``.
        """
        for languages in entities.values():
            for language, sub in languages.items():

                # Each MindMeld app works on one language. The language code is
                # the dict key; the value is the file stem (for example
                # "color_entries_en") and never equals a language code.
                if language != self.language:
                    continue

                dialogflow_entity_file = os.path.join(
                    self.dialogflow_project_directory, "entities", sub + ".json"
                )

                mindmeld_entity_directory_name = self.clean_check(
                    sub, self.entities_list
                )
                if mindmeld_entity_directory_name is None:
                    # Name collision: clean_check has already logged the error.
                    continue

                # Remove the DF entity reference "entries_" from the folder
                # name only, so a project path that happens to contain the
                # same text is left intact.
                mindmeld_entity_directory = os.path.join(
                    self.mindmeld_project_directory,
                    "entities",
                    mindmeld_entity_directory_name.replace("entries_", ""),
                )

                self.create_directory(mindmeld_entity_directory)
                self._create_entity_file(
                    dialogflow_entity_file, mindmeld_entity_directory
                )

    @staticmethod
    def _create_entity_file(dialogflow_entity_file, mindmeld_entity_directory):
        source_en = open(dialogflow_entity_file, "r")
        target_gazetteer = open(
            os.path.join(mindmeld_entity_directory, "gazetteer.txt"), "w"
        )
        target_mapping = open(
            os.path.join(mindmeld_entity_directory, "mapping.json"), "w"
        )

        datastore = json.load(source_en)
        mapping_dict = {"entities": []}

        for item in datastore:
            new_dict = {}
            while ("value" in item) and (item["value"] in item["synonyms"]):
                item["synonyms"].remove(item["value"])
            new_dict["whitelist"] = item["synonyms"]
            new_dict["cname"] = item["value"]
            mapping_dict["entities"].append(new_dict)

            target_gazetteer.write(item["value"] + "\n")

        json.dump(mapping_dict, target_mapping, ensure_ascii=False, indent=2)

        source_en.close()
        target_gazetteer.close()
        target_mapping.close()

    def _create_intents_directories(self, intents):
        """Create a MindMeld intent folder for every Dialogflow intent in ``self.language``.

        Args:
            intents (dict): Mapping as returned by ``_get_file_names("intents")``,
                i.e. ``{intent_name: {language: usersays_file_stem}}``.
        """
        for languages in intents.values():
            for language, sub in languages.items():

                if language != self.language:
                    # Each MindMeld app works on one language
                    continue

                dialogflow_intent_file = os.path.join(
                    self.dialogflow_project_directory, "intents", sub + ".json"
                )

                mindmeld_intent_directory_name = self.clean_check(
                    sub, self.intents_list
                )
                if mindmeld_intent_directory_name is None:
                    # Name collision: clean_check has already logged the error.
                    continue

                # DF has "default" intents like "default_fallback" and "default_greeting"
                # which are in-built intents. We map these intents to the "unrelated" domain
                # compared to the other app specific intents being mapped to the "app_specific"
                # domain.
                if "default" in mindmeld_intent_directory_name:
                    domain = "unrelated"
                else:
                    domain = "app_specific"

                # Remove the DF intent reference "usersays_" from the folder
                # name only, so a project path that happens to contain the
                # same text is left intact.
                mindmeld_intent_directory = os.path.join(
                    self.mindmeld_project_directory,
                    "domains",
                    domain,
                    mindmeld_intent_directory_name.replace("usersays_", ""),
                )

                self.create_directory(mindmeld_intent_directory)
                self._create_intent_file(
                    dialogflow_intent_file, mindmeld_intent_directory, language
                )

    def _create_intent_file(
        self, dialogflow_intent_file, mindmeld_intent_directory, language
    ):
        """Convert one Dialogflow ``usersays`` file into a MindMeld ``train.txt``.

        Every annotated chunk of a Dialogflow example becomes
        ``{text|entity_type|role}``; chunks without annotation (or annotated
        with ``@sys.ignore``) are copied verbatim.

        Args:
            dialogflow_intent_file (str): Path of the Dialogflow usersays JSON file.
            mindmeld_intent_directory (str): Existing folder that receives
                ``train.txt``.
            language (str): Language code appended to non-system entity types.
        """
        with open(dialogflow_intent_file, "r", encoding="utf-8") as source:
            datastore = json.load(source)

        default_intent_to_training_file = {
            "default_fallback_intent": "unrelated.txt",
            "default_welcome_intent": "greetings.txt",
        }

        all_text = []
        for usersay in datastore:
            parts = []
            for texts in usersay["data"]:
                df_text = texts["text"]
                if "meta" not in texts or texts["meta"] == "@sys.ignore":
                    parts.append(df_text)
                    continue

                df_meta = texts["meta"]
                role_type = texts["alias"].replace("-", "_")

                # The dot is escaped so that only real "@sys.<name>" types
                # count as system entities; a custom entity such as "@system"
                # must take the custom-entity branch.
                if re.match(r"@sys\..+", df_meta):
                    if df_meta in DialogflowConverter.sys_entity_map:
                        entity_type = DialogflowConverter.sys_entity_map[df_meta]
                    else:
                        mm_meta = "[DNE: {sysEntity}]".format(sysEntity=df_meta[1:])
                        logger.info(
                            "Unfortunately mindmeld does not currently support "
                            "%s as a sys entity. "
                            "Please create an entity for this.",
                            df_meta[1:],
                        )
                        entity_type = self.clean_name(mm_meta) + "_" + language
                else:
                    entity_type = self.clean_name(df_meta[1:]) + "_" + language

                parts.append("{" + df_text + "|" + entity_type + "|" + role_type + "}")

            all_text.append("".join(parts))

        # Built-in Dialogflow intents get extra examples from the training
        # files stored next to this module.
        for key, file_name in default_intent_to_training_file.items():
            if key in mindmeld_intent_directory:
                with open(
                    os.path.join(package_dir, file_name), encoding="utf-8"
                ) as fp:
                    for line in fp:
                        all_text.append(line.strip())

        # Double the size of the training set if there are less than the number of
        # folds for cross-val in the config.py file
        intent_config = DEFAULT_INTENT_CLASSIFIER_CONFIG
        if self.custom_config_file_path:
            config_path = os.path.join(self.mindmeld_project_directory, "config.py")
            spec = importlib.util.spec_from_file_location("mindmeld_app", config_path)
            config = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(config)
            # NOTE(review): only INTENT_RECOGNIZER_CONFIG is consulted here;
            # confirm this is the attribute name the app's config.py defines.
            intent_config = getattr(config, "INTENT_RECOGNIZER_CONFIG", intent_config)

        # An empty list can never grow by doubling, so skip the loop instead
        # of spinning forever when there are no training sentences.
        if all_text:
            while len(all_text) < intent_config["param_selection"]["k"]:
                all_text = all_text * 2

        with open(
            os.path.join(mindmeld_intent_directory, "train.txt"),
            "w",
            encoding="utf-8",
        ) as target_train:
            target_train.write("\n".join(all_text))

    def _get_file_names(self, level):
        """Gets the names of the entities from Dialogflow as a dictionary.
        levels (str): either "entities" or "intents"

        ex. if we had the following files in our entities directory:
            ["test.json", "test_entries_en.json", "test_entries_de.json"]
        it returns:
            {'test': {'en': 'test_entries_en', 'de': 'test_entries_de'}}"""

        directory = os.path.join(self.dialogflow_project_directory, level)
        files = os.listdir(directory)

        w = {"entities": "entries", "intents": "usersays"}
        p = r".+(?<=(_" + w[level] + "_))(.*)(?=(.json))"
        language = "en"

        info = {}
        for name in files:
            match = re.match(p, name)

            if match:
                isbase = False
                base = name[: match.start(1)]
                language = str(match.group(2))
            else:
                isbase = True
                base = name[:-5]

            if base not in info:
                info[base] = {}

            if not isbase:
                info[base][language] = name[:-5]

        return info

[docs]    def create_mindmeld_training_data(self):
        entities = self._get_file_names("entities")
        self._create_entities_directories(entities)

        intents = self._get_file_names("intents")
        self._create_intents_directories(intents)

    # =========================
    # create init
    # =========================

[docs]    @staticmethod
    def clean_name(name):
        """ Takes in a string and returns a valid folder name (no spaces, all lowercase)."""
        name = re.sub(r"[^\w\s-]", "", name).strip().lower()
        name = re.sub(r"[-\s]+", "_", name)
        return name

[docs]    def clean_check(self, name, lst):
        """Takes in a list of strings and a name.
        Returns name cleaned if the cleaned name is not found in lst."""
        cleaned = self.clean_name(name)

        if cleaned not in lst:
            lst.add(cleaned)
            return cleaned
        else:
            logger.error(
                "%s name has been created twice. Please ensure there "
                "are no duplicate names in the dialogflow files and "
                "filenames are valid (no spaces or special characters)",
                cleaned,
            )

[docs]    def create_mindmeld_init(self):
        with open(
            os.path.join(self.mindmeld_project_directory, "__init__.py"), "w"
        ) as target:

            self.code_gen.begin(tab="    ")
            self.code_gen.generate_top_block()

            intents = self._get_file_names("intents")

            for main in intents:

                df_main = os.path.join(
                    self.dialogflow_project_directory, "intents", main + ".json"
                )

                with open(df_main) as source:
                    if "usersays" in df_main:
                        logger.error(
                            "Please check if your intent file"
                            "names are correctly labeled."
                        )
                        return

                    datastore = json.load(source)
                    intent = self.clean_name(datastore["name"])
                    for response in datastore["responses"]:
                        self.generate_handlers(intent, response)

            target.write(self.code_gen.end())
            target.write("\n")

[docs]    def generate_handlers(self, intent, response):
        message = response["messages"][0]
        language = message["lang"]
        intent_lang = "%s_%s" % (intent, language)
        intent_entity_role_replies = {intent_lang: {}}

        for param in response["parameters"]:
            if param["required"]:
                entity = param["dataType"]
                if entity in DialogflowConverter.sys_entity_map:
                    entity = DialogflowConverter.sys_entity_map[entity]
                else:
                    entity = param["dataType"].replace("@", "").replace("-", "_")
                    entity = "%s_%s" % (entity, language)
                role = param["name"].replace("@", "").replace("-", "_")

                prompts = []
                if "prompts" in param:
                    prompts = [x["value"] for x in param["prompts"]]
                else:
                    prompts = ["What is the " + param["name"]]

                if entity in intent_entity_role_replies[intent_lang]:
                    intent_entity_role_replies[intent_lang][entity][role] = prompts
                else:
                    intent_entity_role_replies[intent_lang][entity] = {role: prompts}

        if "speech" in message:
            data = message["speech"]
            replies = data if isinstance(data, list) else [data]
            slot_templated_replies = []

            is_slot_template = False
            for resp in replies:
                template = resp
                slots = re.findall("\$([\w\-\_]+)", resp)
                for slot in slots:
                    template = template.replace(
                        "$" + slot, "{" + slot.replace("-", "_") + "}"
                    )
                if template != resp:
                    is_slot_template = True
                slot_templated_replies.append(template)

            handle = "intent='%s_%s'" % (intent, language)
            function_name = intent + "_" + language + "_handler"

            if is_slot_template:
                self.code_gen.generate_followup_function_code_block(
                    handle,
                    function_name,
                    intent_entity_role_replies,
                    slot_templated_replies,
                )
            else:
                self.code_gen.generate_function(
                    handle=handle,
                    function_name=function_name,
                    replies=replies,
                )

    # =========================
    # convert project
    # =========================

[docs]    def convert_project(self):
        """Converts a Dialogflow project into a MindMeld project.

        Dialogflow projects consist of entities and intents.
            note on languages:
                Dialogflow supports multiple languages and locales. They store their training
                data for different languages in different files. So, the name of each training
                file ends with a meta tag, two letters long for language, and an additional
                two letters for dialect (if applicable). For example, a file ending in "_en-au"
                indicates it's in English (Australia). Below we use "la" to represent this
                meta tag.

            entities folder contains:
                entityName.json - Meta data about entityName for all languages.
                entityName_la.json - One for each language, contains entitiy mappings.

            intents folder contain:
                intentName.json - Contains rules, information about conversation flow, meta data.
                    Contains previously mentioned information and responses for all languages.
                intentName_usersays_la.json - one for each language,
                    contains training data to recognize intentName

        Limitations:
        - The converter is unable to create an entity when it encounters an
        unrecognized entity (an entity not defined under entities folder
         or system entities), and labels such entities as DNE in training data.
        - The converter currently does not automatically convert features like
        slot filling, contexts, and follow-up intents. Users can still implement such
        features and more.
        - Information in agent.json are not copied over.
        - There is no official support for different languages. Users can still
        implement this. The converter is able to successfully convert dialogflow
        bots that support multiple languages.

        MindMeld:
        - Users can store data locally
        - Users can build a knowledge base (currently beta in Dialogflow).
        - Users can configure the machine learning models to best suit their needs.
        - Users have more flexibility in defining their own features, including
         ones like slot filling, contexts, and follow-up intents.
        """

        logger.info("Converting project.")

        # Create project directory with sub folders
        self.create_mindmeld_directory()

        # copy config file to the MindMeld dir
        if self.custom_config_file_path:
            copyfile(
                self.custom_config_file_path,
                os.path.join(self.mindmeld_project_directory, "config.py"),
            )

        file_loc = os.path.dirname(os.path.realpath(__file__))
        self.create_main(self.mindmeld_project_directory, file_loc)
        self.create_mindmeld_init()

        # Transfer over test data from Dialogflow project and reformat to MindMeld project
        self.create_mindmeld_training_data()
        logger.info("Project converted.")