Source code for mindmeld.query_factory

# -*- coding: utf-8 -*-
#
# Copyright (c) 2015 Cisco Systems, Inc. and others.  All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#     http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""This module contains the query factory class."""
from __future__ import absolute_import, unicode_literals

import logging

from .components._config import get_language_config
from .core import TEXT_FORM_NORMALIZED, TEXT_FORM_PROCESSED, TEXT_FORM_RAW, Query
from .system_entity_recognizer import (
    DucklingRecognizer,
    NoOpSystemEntityRecognizer,
    SystemEntityRecognizer,
)
from .text_preparation.text_preparation_pipeline import TextPreparationPipelineFactory

logger = logging.getLogger(__name__)


[docs]class QueryFactory: """An object which encapsulates the components required to create a Query object. Attributes: text_preparation_pipeline (TextPreparationPipeline): Pipeline class responsible for processing queries. language (str): the language of the text locale (str): the locale of the text system_entity_recognizer (SystemEntityRecognizer): default to NoOpSystemEntityRecognizer duckling (bool): if no system entity recognizer is provided, initialize a new Duckling recognizer instance. """ def __init__( self, text_preparation_pipeline=None, locale=None, language=None, system_entity_recognizer=None, duckling=False, ): self.text_preparation_pipeline = text_preparation_pipeline self.locale = locale self.language = language if system_entity_recognizer: self.system_entity_recognizer = system_entity_recognizer elif duckling: self.system_entity_recognizer = DucklingRecognizer.get_instance() else: logger.warning( "No System Entity Recognizer selected, set 'duckling=True' for DucklingRecognizer", ) self.system_entity_recognizer = NoOpSystemEntityRecognizer.get_instance()
[docs] def create_query( self, text, time_zone=None, timestamp=None, locale=None, language=None ): """Creates a query with the given text. Args: text (str): Text to create a query object for time_zone (str, optional): An IANA time zone id to create the query relative to. timestamp (int, optional): A reference unix timestamp to create the query relative to, in seconds. locale (str, optional): The locale representing the ISO 639-1 language code and \ ISO3166 alpha 2 country code separated by an underscore character. language (str, optional): Language as specified using a 639-1/2 code Returns: Query: A newly constructed query """ if not language and not locale: language = self.language locale = self.locale raw_text = text char_maps = {} # Step 1: Preprocessing if self.text_preparation_pipeline.custom_preprocessors_exist(): preprocessed_text = self.text_preparation_pipeline.preprocess(raw_text) ( forward_map, backward_map, ) = self.text_preparation_pipeline.get_char_index_map( raw_text, preprocessed_text ) char_maps[(TEXT_FORM_RAW, TEXT_FORM_PROCESSED)] = forward_map char_maps[(TEXT_FORM_PROCESSED, TEXT_FORM_RAW)] = backward_map else: preprocessed_text = raw_text # Step 2: Tokenization and Step 3: Normalization normalized_tokens = self.text_preparation_pipeline.tokenize_and_normalize(preprocessed_text) normalized_text = " ".join([t["entity"] for t in normalized_tokens]) # Step 4: Stemming stemmed_tokens = [ self.text_preparation_pipeline.stem_word(t["entity"]) for t in normalized_tokens ] # Create Normalized Maps ( normalization_forward_map, normalization_backward_map, ) = self.text_preparation_pipeline.get_char_index_map( preprocessed_text, normalized_text ) char_maps[ (TEXT_FORM_PROCESSED, TEXT_FORM_NORMALIZED) ] = normalization_forward_map char_maps[ (TEXT_FORM_NORMALIZED, TEXT_FORM_PROCESSED) ] = normalization_backward_map query = Query( raw_text, preprocessed_text, normalized_tokens, char_maps, locale=locale, language=language, time_zone=time_zone, timestamp=timestamp, stemmed_tokens=stemmed_tokens, ) query.system_entity_candidates = self.system_entity_recognizer.get_candidates( query, locale=locale, language=language ) return query
[docs] def normalize(self, text): """Normalizes the given text. Args: text (str): Text to process Returns: str: Normalized text """ return self.text_preparation_pipeline.normalize(text)
def __repr__(self): return "<{} id: {!r}>".format(self.__class__.__name__, id(self))
[docs] @staticmethod def create_query_factory( app_path, text_preparation_pipeline=None, system_entity_recognizer=None, duckling=False, ): """Creates a query factory for the application. Args: app_path (str, optional): The path to the directory containing the app's data. If None is passed, a default query factory will be returned. text_preparation_pipeline (TextPreparationPipeline, optional): Pipeline class responsible for processing queries. system_entity_recognizer (SystemEntityRecognizer): If not passed, we use either the one from the application's configuration or NoOpSystemEntityRecognizer. duckling (bool, optional): if no system entity recognizer is provided, initialize a new Duckling recognizer instance. Returns: QueryFactory: A QueryFactory object that is used to create Query objects. """ language, locale = get_language_config(app_path) if text_preparation_pipeline is None: text_preparation_pipeline = TextPreparationPipelineFactory.create_from_app_path( app_path ) if system_entity_recognizer: sys_entity_recognizer = system_entity_recognizer elif app_path: sys_entity_recognizer = SystemEntityRecognizer.load_from_app_path(app_path) elif duckling: sys_entity_recognizer = DucklingRecognizer.get_instance() else: logger.warning( "No System Entity Recognizer selected, set 'duckling=True' for DucklingRecognizer", ) sys_entity_recognizer = NoOpSystemEntityRecognizer.get_instance() return QueryFactory( text_preparation_pipeline=text_preparation_pipeline, language=language, locale=locale, system_entity_recognizer=sys_entity_recognizer, )