Source code for mindmeld.components.parser

# -*- coding: utf-8 -*-
#
# Copyright (c) 2015 Cisco Systems, Inc. and others.  All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#     http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This module contains the language parser component of the MindMeld natural language processor
"""
import logging
import time
from collections import OrderedDict, defaultdict, namedtuple

from nltk import FeatureChartParser
from nltk.featstruct import Feature
from nltk.grammar import FeatureGrammar

from .. import path
from ..core import Span
from ..exceptions import ParserTimeout
from ._config import get_parser_config

logger = logging.getLogger(__name__)

START_SYMBOL = "S"
HEAD_SYMBOL = "H"

TYPE_FEATURE = Feature("type", display="prefix")

START_SYMBOLS = frozenset({START_SYMBOL, HEAD_SYMBOL})

MAX_PARSE_TIME = 2.0


class Parser:
    """
    A language parser which is used to extract relations between entities in a given
    query and group related entities together.

    The parser uses a context-free grammar based on a configuration to generate
    candidate entity groupings. Heuristics are then used to rank and select a grouping.

    This rule-based parser will be helpful in many situations, but if you have a
    sufficiently sophisticated entity hierarchy, you may benefit from using a
    statistical approach.

    Attributes:
        config (dict): The parser config.
    """

    def __init__(
        self,
        resource_loader=None,
        config=None,
        allow_relaxed=True,
        domain=None,
        intent=None,
    ):
        """Initializes the parser

        Args:
            resource_loader (ResourceLoader): An object which can load resources for the parser.
            config (dict, optional): The configuration for the parser. If none is
                provided, the app config will be loaded.
        """
        if not resource_loader and not config:
            raise ValueError(
                "Parser requires either a configuration or a resource loader"
            )
        app_path = resource_loader.app_path if resource_loader else None
        try:
            entity_types = path.get_entity_types(app_path) + ["unk"]
        except TypeError:
            entity_types = {"unk"}
        self._resource_loader = resource_loader
        self.config = get_parser_config(app_path, config, domain, intent) or {}
        configured_entities = set()
        for entity_type, entity_config in self.config.items():
            configured_entities.add(entity_type)
            configured_entities.update(entity_config.keys())

        self._configured_entities = configured_entities
        rules = generate_grammar(self.config, entity_types)
        self._grammar = FeatureGrammar.fromstring(rules)
        self._parser = FeatureChartParser(self._grammar)
        if allow_relaxed:
            relaxed_rules = generate_grammar(self.config, entity_types, relaxed=True)
            self._relaxed_grammar = FeatureGrammar.fromstring(relaxed_rules)
            self._relaxed_parser = FeatureChartParser(self._relaxed_grammar)
        else:
            self._relaxed_grammar = None
            self._relaxed_parser = None

    def parse_entities(
        self,
        query,
        entities,
        all_candidates=False,
        handle_timeout=True,
        timeout=MAX_PARSE_TIME,
    ):
        """Determines groupings of entities for the given query.

        Args:
            query (Query): The query being parsed.
            entities (list[QueryEntity]): The entities to find groupings for.
            all_candidates (bool, optional): Whether to return all the entity candidates.
            handle_timeout (bool, optional): False if an exception should be raised when
                parsing times out. Defaults to True.
            timeout (float, optional): The amount of time to wait for the parsing to
                complete. By default this is set to MAX_PARSE_TIME. If None is passed,
                parsing will never time out.

        Returns:
            (tuple[QueryEntity]): An updated version of the entities collection passed in,
                with their parent and children attributes set appropriately.
        """
        if not self._configured_entities:
            return entities

        if not handle_timeout:
            return self._parse(
                query, entities, all_candidates=all_candidates, timeout=timeout
            )

        try:
            return self._parse(
                query, entities, all_candidates=all_candidates, timeout=timeout
            )
        except ParserTimeout:
            logger.warning("Parser timed out parsing query %r", query.text)
            return entities

    def _parse(self, query, entities, all_candidates, timeout):
        entity_type_count = defaultdict(int)
        entity_dict = {}
        tokens = []  # tokens to be parsed

        # generate sentential form (assumes entities are sorted)
        for entity in entities:
            entity_type = entity.entity.type
            role_type = entity.entity.role
            if role_type:
                # Append role type to entity type with "--" separator
                entity_with_role_type = entity_type + "--" + role_type
                if entity_with_role_type in self._configured_entities:
                    entity_type = entity_with_role_type
            if entity_type not in self._configured_entities:
                entity_type = "unk"
            entity_id = "{}{}".format(entity_type, entity_type_count[entity_type])
            entity_type_count[entity_type] += 1
            entity_dict[entity_id] = entity
            tokens.append(entity_id)

        logger.debug("Parsing sentential form: %r", " ".join(tokens))
        start_time = time.time()
        parses = []
        for parse in self._parser.parse(tokens):
            parses.append(parse)
            if timeout is not None and (time.time() - start_time) > timeout:
                raise ParserTimeout("Parsing took too long")

        if not parses and self._relaxed_parser:
            for parse in self._relaxed_parser.parse(tokens):
                parses.append(parse)
                if timeout is not None and (time.time() - start_time) > MAX_PARSE_TIME:
                    raise ParserTimeout("Parsing took too long")

        if not parses:
            if all_candidates:
                return []
            return entities

        ranked_parses = self._rank_parses(
            query, entity_dict, parses, timeout, start_time
        )
        if all_candidates:
            return ranked_parses

        # if we still have more than one, choose the first
        entities = self._get_flat_entities(ranked_parses[0], entities, entity_dict)
        return tuple(sorted(entities, key=lambda e: e.span.start))

    def _rank_parses(self, query, entity_dict, parses, timeout, start_time=None):
        start_time = start_time or time.time()
        resolved = OrderedDict()
        for parse in parses:
            if timeout is not None and time.time() - start_time > timeout:
                raise ParserTimeout("Parsing took too long")
            resolved[self._resolve_parse(parse)] = None

        filtered = (p for p in resolved.keys())

        # Prefer parses with fewer groups
        parses = list(sorted(filtered, key=len))
        filtered = (p for p in parses if len(p) <= len(parses[0]))

        # Prefer parses with minimal distance from dependents to heads
        parses = list(
            sorted(filtered, key=lambda p: self._parse_distance(p, query, entity_dict))
        )
        min_parse_dist = self._parse_distance(parses[0], query, entity_dict)
        filtered = (
            p
            for p in parses
            if self._parse_distance(p, query, entity_dict) <= min_parse_dist
        )

        # TODO: apply precedence
        return list(filtered)

    def _parse_distance(self, parse, query, entity_dict):
        total_link_distance = 0
        stack = list(parse)
        while stack:
            node = stack.pop()
            head = entity_dict[node.id]
            for dep in node.dependents or set():
                if dep.dependents:
                    stack.append(dep)
                    continue
                child = entity_dict[dep.id]
                if child.token_span.start > head.token_span.start:
                    intra_entity_span = Span(
                        head.token_span.end, child.token_span.start
                    )
                else:
                    intra_entity_span = Span(
                        child.token_span.end, head.token_span.start
                    )
                link_distance = 0
                for token in intra_entity_span.slice(query.text.split(" ")):
                    if token in self.config[node.type][dep.type]["linking_words"]:
                        link_distance -= 0.5
                    else:
                        link_distance += 1
                total_link_distance += link_distance

        return total_link_distance

    @staticmethod
    def _get_flat_entities(parse, entities, entity_dict):
        stack = [g.to_query_entity(entity_dict) for g in parse]
        new_dict = {}
        while stack:
            entity = stack.pop()
            new_dict[(entity.entity.type, entity.span.start)] = entity
            for child in entity.children or ():
                stack.append(child)

        return [new_dict.get((e.entity.type, e.span.start), e) for e in entities]

    @classmethod
    def _resolve_parse(cls, node):
        groups = set()
        for child in node:
            child_symbol = child.label()[TYPE_FEATURE]

            if child_symbol in START_SYMBOLS:
                groups.update(cls._resolve_parse(child))
            else:
                group = cls._resolve_group(child).freeze()
                groups.add(group)

        return frozenset(groups)

    @classmethod
    def _resolve_group(cls, node):
        symbol = node.label()[TYPE_FEATURE]

        if not symbol[0].isupper():
            # this node is a generic entity of type {symbol}; its child is the terminal
            return _EntityNode(symbol, node[0], None)

        # if the first char is capitalized, this is a group!
        group_type = symbol.lower()
        dependents = set()
        for child in node:
            child_symbol = child.label()[TYPE_FEATURE]
            if child_symbol == symbol:
                # this is the ancestor of this group
                group = cls._resolve_group(child)
            elif child_symbol == group_type:
                # this is the root ancestor of this group
                group = cls._resolve_group(child)
                group = _EntityNode(group.type, group.id, set())
            else:
                dependents.add(cls._resolve_group(child).freeze())

        group.dependents.update(dependents)
        return group
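

# Usage sketch (an illustrative addition, not part of the original module): the
# Parser can be constructed from a plain config dict instead of a ResourceLoader.
# The entity names "dish", "option" and "size" below are hypothetical placeholders;
# each head entity maps its dependents to options such as "left", "right" and
# "max_instances".
#
#     parser = Parser(config={
#         "dish": {
#             "option": {"left": True, "right": True},
#             "size": {"right": True, "max_instances": 1},
#         }
#     })
#     grouped = parser.parse_entities(query, entities)
#
# parse_entities returns the entities with their parent and children attributes set
# according to the highest-ranked candidate grouping.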


class _EntityNode(namedtuple("EntityNode", ("type", "id", "dependents"))):
    """A private tree data structure used to parse queries

    EntityNodes use sets and are conditionally hashable. This makes it easy to check
    the equivalence of parse trees represented as entity nodes.
    """

    def freeze(self):
        """Converts to a 'frozen' representation that can be hashed"""
        if self.dependents is None:
            return self
        frozen_dependents = frozenset((d.freeze() for d in self.dependents))
        return _EntityNode(self.type, self.id, frozen_dependents)

    def pretty(self, indent=0):
        """Pretty prints the entity node. Primarily useful for debugging."""
        text = (" " * indent) + self.id
        if not self.dependents:
            return text
        return (
            text + "\n" + "\n".join(dep.pretty(indent + 1) for dep in self.dependents)
        )

    def to_query_entity(self, entity_dict, is_root=True):
        """Converts a node to a QueryEntity

        Args:
            entity_dict (dict): A mapping from entity ids to the corresponding original
                QueryEntity objects
        """
        if not self.dependents and not is_root:
            return entity_dict[self.id]

        head = entity_dict[self.id]
        if self.dependents is None:
            return head

        dependents = tuple(
            (c.to_query_entity(entity_dict, is_root=False) for c in self.dependents)
        )
        return head.with_children(dependents)


def _build_symbol_template(group, features):
    """Builds a template for a symbol in a feature CFG.

    Args:
        group (str): The group the template is for
        features (iterable): The names of features which should be included in the template

    Example:
        >>> _build_symbol_template('Group', {'feat1', 'feat2'})
        "Group[feat1={feat1}, feat2={feat2}]"
    """
    symbol_template = group
    for feature in features:
        if symbol_template is group:
            symbol_template += "["
        else:
            symbol_template += ", "
        symbol_template += "{0}={{{0}}}".format(feature)
    if symbol_template is not group:
        symbol_template += "]"
    return symbol_template


def _generate_dependent_rules(dep_type, config, symbol_template, features, head_types):
    """Generates the rules for a dependent entity

    Args:
        dep_type (str): The entity type of the dependent
        config (dict): A dictionary containing the configuration for this dependent
        symbol_template (str): A symbol template
        features (iterable): A list of features for this symbol
        head_types (set): All symbols which have dependents

    Yields:
        str: A rule for the dependent
    """
    # If the dependent is a group, its symbol should be capitalized
    dep_symbol = dep_type.capitalize() if dep_type in head_types else dep_type
    max_instances = config.get("max_instances")

    if max_instances is None:
        # pass through features unchanged
        lhs = symbol_template.format(
            **{f: "?" + chr(ord("a") + i) for i, f in enumerate(features)}
        )
        rhs = lhs
        if config.get("left"):
            yield "{lhs} -> {dep} {rhs}".format(lhs=lhs, rhs=rhs, dep=dep_symbol)
        if config.get("right"):
            yield "{lhs} -> {rhs} {dep}".format(lhs=lhs, rhs=rhs, dep=dep_symbol)
    else:
        for dep_count in range(max_instances):
            feature_dict = {
                f: "?" + chr(ord("a") + i)
                for i, f in enumerate(features)
                if f is not dep_type
            }
            feature_dict[dep_type] = dep_count
            rhs = symbol_template.format(**feature_dict)
            feature_dict[dep_type] = dep_count + 1
            lhs = symbol_template.format(**feature_dict)

            if config.get("left"):
                yield "{lhs} -> {dep} {rhs}".format(lhs=lhs, rhs=rhs, dep=dep_symbol)
            if config.get("right"):
                yield "{lhs} -> {rhs} {dep}".format(lhs=lhs, rhs=rhs, dep=dep_symbol)
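

# Illustrative note (not in the original module): for a hypothetical dependent
# "size" configured as {"right": True, "max_instances": 1} on a "Dish" group whose
# symbol template is "Dish[size={size}]", _generate_dependent_rules yields a single
# rule that increments the feature count:
#
#     Dish[size=1] -> Dish[size=0] size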


def generate_grammar(config, entity_types=None, relaxed=False, unique_entities=20):
    """Generates a feature context-free grammar from the provided parser config.

    Args:
        config (dict): The parser configuration
        entity_types (iterable, optional): The entity types in the application
        relaxed (bool, optional): Whether dependent entities should also be allowed as
            standalone heads
        unique_entities (int, optional): The number of entities of the same type that
            should be permitted in the same query

    Returns:
        str: a string containing the grammar with rules separated by line
    """
    entity_types = set(entity_types or ())

    # start rules
    rules = [
        "{} -> {}".format(START_SYMBOL, HEAD_SYMBOL),  # The start rule
        "{0} -> {0} {0}".format(HEAD_SYMBOL),  # Allow multiple heads
    ]

    # the set of all heads
    head_types = set(config.keys())
    # the set of all dependents
    dependent_types = set((t for g in config.values() for t in g))
    all_types = head_types.union(dependent_types).union(entity_types)

    for entity in all_types:
        if entity not in head_types and entity not in dependent_types:
            # Add entities which are not mentioned in the config as standalones
            rules.append("H -> {}".format(entity))
        elif relaxed and entity not in head_types and entity in dependent_types:
            # Add dependent entities as standalones in relaxed mode
            rules.append("H -> {}".format(entity))

    # create rules for each group
    for entity in head_types:
        # the symbol for a group is the capitalized version of the string
        group = entity.capitalize()
        rules.append("H -> {}".format(group))

        dep_configs = config[entity]

        # If a dependent has a max number of instances, we will track it as a feature
        features = [
            t for t, d in dep_configs.items() if d.get("max_instances") is not None
        ]
        symbol_template = _build_symbol_template(group, features)

        # basic rule with features initialized to 0
        rules.append(
            "{} -> {}".format(
                symbol_template.format(**{f: 0 for f in features}), entity
            )
        )

        for dep_type, dep_config in dep_configs.items():
            rules.extend(
                _generate_dependent_rules(
                    dep_type, dep_config, symbol_template, features, head_types
                )
            )

    for entity in all_types:
        for idx in range(unique_entities):
            rules.append("{0} -> '{0}{1}'".format(entity, idx))

    return "\n".join(rules)
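

# Minimal runnable sketch (an addition for illustration, not part of the original
# module): print the grammar produced by generate_grammar for a small, hypothetical
# parser config. The entity names are placeholders.
if __name__ == "__main__":
    example_config = {
        "dish": {
            "option": {"left": True, "right": True},
            "size": {"right": True, "max_instances": 1},
        }
    }
    # A small unique_entities value keeps the printed terminal rules short
    print(generate_grammar(example_config, entity_types=["unk"], unique_entities=3))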