Source code for mindmeld.tokenizer

# -*- coding: utf-8 -*-
# Copyright (c) 2015 Cisco Systems, Inc. and others.  All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
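# You may obtain a copy of the License at
#     http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.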
# See the License for the specific language governing permissions and
# limitations under the License.

"""This module contains the tokenizer."""

import codecs
import logging
import re
import sre_constants

from .components._config import get_tokenizer_config
from .constants import CURRENCY_SYMBOLS
from .path import ASCII_FOLDING_DICT_PATH

logger = logging.getLogger(__name__)

class Tokenizer:
    """Normalizes and tokenizes a piece of text.

    Raw tokens are obtained by splitting on whitespace. Each raw token is then
    run through a regex-driven replacement step, folded to ASCII using the
    folding table, lower-cased and split again into normalized tokens.
    """

    # Characters whose code point is below this value are returned unchanged
    # by fold_char_to_ascii.
    _ASCII_CUTOFF = ord("\u0080")

    def __init__(self, app_path=None, exclude_from_norm=None):
        """Initializes the tokenizer.

        Args:
            app_path (str, optional): Passed to ``get_tokenizer_config`` to
                obtain the tokenizer configuration.
            exclude_from_norm (optional): list of chars to exclude from
                normalization
        """
        # Remembered so that __deepcopy__ can rebuild an equivalent tokenizer.
        self.app_path = app_path
        self.ascii_folding_table = self.load_ascii_folding_table()
        self.exclude_from_norm = exclude_from_norm or []
        self.config = get_tokenizer_config(app_path, self.exclude_from_norm)
        self._custom = False
        self._init_regex()

    def _init_regex(self):
        """Initialize the regexes for matching and tokenizing text.

        Sets ``self.compiled`` (used when ``keep_special_chars=False``),
        ``self.keep_special_compiled`` (used when ``keep_special_chars=True``),
        ``self.replace_lookup`` and ``self._custom``.
        """
        letter_pattern_str = r"[^\W\d_]+"
        special_chars = r"[^\w\d\s&]+"

        # The excluded characters are spliced into a character class, so they
        # are escaped: a raw "-", "]", "^" or "\" would otherwise change the
        # meaning of the class or make the pattern fail to compile.
        to_exclude = CURRENCY_SYMBOLS + "".join(
            re.escape(char) for char in self.exclude_from_norm
        )

        # List of regex's for matching and tokenizing when keep_special_chars=False.
        # Each entry becomes one named alternative "(?P<name>...)" once joined.
        regex_list = [
            r"?P<start>^[^\w\d&" + to_exclude + "]+",
            r"?P<end>[^\w\d&" + to_exclude + "]+$",
            "?P<pattern1>(?P<pattern1_replace>"
            + letter_pattern_str
            + ")"
            + special_chars
            + r"(?=[\d]+)",
            r"?P<pattern2>(?P<pattern2_replace>[\d]+)"
            + special_chars
            + "(?="
            + letter_pattern_str
            + ")",
            "?P<pattern3>(?P<pattern3_replace>"
            + letter_pattern_str
            + ")"
            + special_chars
            + "(?="
            + letter_pattern_str
            + ")",
            "?P<underscore>_",
            r"?P<begspace>^\s+",
            r"?P<trailspace>\s+$",
            r"?P<spaceplus>\s+",
            "?P<apos_space> '|' ",
            r"?P<apos_s>(?<=[^\s])'[sS]",
            # handle the apostrophes used at the end of a possessive form,
            # e.g. dennis'
            r"?P<apos_poss>(^'(?=\S)|(?<=\S)'$)",
        ]

        # Replace lookup based on regex: group name -> (replacement template,
        # name of the group whose text fills the template, or None).
        self.replace_lookup = {
            "apos_s": (" 's", None),
            "apos_poss": ("", None),
            "apos_space": (" ", None),
            "begspace": ("", None),
            "end": (" ", None),
            "escape1": ("{0}", "escape1_replace"),
            "escape2": ("{0} ", "escape2_replace"),
            "pattern1": ("{0} ", "pattern1_replace"),
            "pattern2": ("{0} ", "pattern2_replace"),
            "pattern3": ("{0} ", "pattern3_replace"),
            "spaceplus": (" ", None),
            "start": (" ", None),
            "trailspace": ("", None),
            "underscore": (" ", None),
            "apostrophe": (" ", None),
        }

        self.compiled = re.compile("(%s)" % ")|(".join(regex_list), re.UNICODE)

        # Check if custom pattern is being used or MM defined
        if self.config.get("allowed_patterns"):
            self._custom = True

        # Create compiled regex expressions
        combined_re = ")|(".join(
            self.config["allowed_patterns"] or self.config["default_allowed_patterns"]
        )
        try:
            self.keep_special_compiled = re.compile(
                "(%s)" % (combined_re,), re.UNICODE,
            )
        except re.error:
            logger.error(
                "Regex compilation failed for the following patterns: %s",
                combined_re,
            )
            # Fall back to the built-in regex so the attribute always exists;
            # otherwise the first tokenize(keep_special_chars=True) call would
            # fail with an AttributeError far away from the real cause.
            self.keep_special_compiled = self.compiled

    # Needed for train-roles where queries are deep copied (and thus tokenizer).
    # Pre compiled patterns don't deepcopy natively. Bug introduced past python 2.5
    # TODO investigate necessity of deepcopy in train-roles
    def __deepcopy__(self, memo):
        """Return a freshly constructed tokenizer with the same settings."""
        # TODO: optimize this
        # getattr keeps this working for instances created before app_path
        # was stored on the object.
        return Tokenizer(
            app_path=getattr(self, "app_path", None),
            exclude_from_norm=self.exclude_from_norm,
        )

    @staticmethod
    def load_ascii_folding_table():
        """Load mapping of ascii code points to ascii characters.

        Returns:
            dict: Maps the code point (int) of the first whitespace-separated
                token on each line to the second token on that line.
        """
        logger.debug(
            "Loading ascii folding mapping from file: %s.", ASCII_FOLDING_DICT_PATH
        )

        ascii_folding_table = {}

        with codecs.open(
            ASCII_FOLDING_DICT_PATH, "r", encoding="unicode_escape"
        ) as mapping_file:
            for line in mapping_file:
                tokens = line.split()
                if len(tokens) < 2:
                    # Blank or incomplete line: nothing to map.
                    continue
                codepoint, ascii_char = tokens[0], tokens[1]
                ascii_folding_table[ord(codepoint)] = ascii_char

        return ascii_folding_table

    def _one_xlat(self, match_object):
        """Helper function for multiple replace.

        Takes match object and looks up replacement.

        Args:
            match_object: A regex match object

        Returns:
            str: A string with punctuation replaced/removed

        Raises:
            KeyError: If the last matched group has no entry in
                ``self.replace_lookup`` (e.g. an unnamed custom pattern).
        """
        template, source_group = self.replace_lookup[match_object.lastgroup]
        if source_group:
            return template.format(match_object.groupdict()[source_group])
        return template

    def multiple_replace(self, text, compiled):
        """Takes text and compiled regex pattern, does lookup for multi rematch.

        Args:
            text (str): The text to perform matching on
            compiled: A compiled regex object that can be used for matching

        Returns:
            str: The text with replacement specified by self.replace_lookup
        """
        try:
            # Checks if replacement can be found in pre-defined match object
            # (non-custom). A match whose group has no entry in the lookup
            # raises KeyError from _one_xlat.
            filtered = compiled.sub(self._one_xlat, text)
        except KeyError:
            # In case of custom/app-specific tokenizer configuration
            logger.info("Using custom tokenizer configuration.")
            # Keep only the text matched by the custom patterns. group(0) is
            # used instead of picking the first non-empty findall() group:
            # with a single custom pattern findall() yields plain strings and
            # the old code kept only the first character of every match.
            return "".join(match.group(0) for match in compiled.finditer(text))

        # If no key error and custom tokenizer was involved then the token has
        # unwanted special characters. Remove them and return.
        if self._custom:
            return self.compiled.sub(self._one_xlat, text)

        # Return filtered text if non-custom tokenizer.
        return filtered

    def normalize(self, text, keep_special_chars=True):
        """Normalize a given text string and return the string with each token normalized.

        Args:
            text (str): The text to normalize
            keep_special_chars (bool): If True, the tokenizer excludes a list of special
                characters used in annotations

        Returns:
            str: the original text string with each token in normalized form
        """
        norm_tokens = self.tokenize(text, keep_special_chars)
        return " ".join(token["entity"] for token in norm_tokens)

    def tokenize(self, text, keep_special_chars=True):
        """Tokenizes the input text, normalizes the token text, and returns normalized tokens.

        Currently it does the following during normalization:
        1. remove leading special characters except dollar sign and ampersand
        2. remove trailing special characters except ampersand
        3. remove special characters except ampersand when the preceding character is a letter and
        the following characters is a number
        4. remove special characters except ampersand when the preceding character is a number and
        the following character is a letter
        5. remove special characters except ampersand when both preceding and following characters
        are letters
        6. remove special character except ampersand when the following character is '|'
        7. remove diacritics and replace it with equivalent ascii character when possible

        Note that the tokenizer also excludes a list of special characters used in annotations
        when the flag keep_special_chars is set to True

        Args:
            text (str): The text to normalize
            keep_special_chars (bool): If True, the tokenizer excludes a list of special
                characters used in annotations

        Returns:
            list: A list of normalized tokens. Each token is a dict with the keys
                "entity", "raw_entity", "raw_token_index" and "raw_start".
        """
        raw_tokens = self.tokenize_raw(text)
        pattern = self.keep_special_compiled if keep_special_chars else self.compiled

        norm_tokens = []
        for i, raw_token in enumerate(raw_tokens):
            if not raw_token["text"]:
                continue

            norm_token_start = len(norm_tokens)
            norm_token_text = self.multiple_replace(raw_token["text"], pattern)

            # remove diacritics and fold the character to equivalent ascii
            # character if possible
            norm_token_text = self.fold_str_to_ascii(norm_token_text).lower()

            # One raw token can produce zero, one or several normalized tokens
            # because the replacement step may insert or leave only spaces.
            norm_token_count = 0
            for token in norm_token_text.split():
                norm_tokens.append(
                    {
                        "entity": token,
                        "raw_entity": raw_token["text"],
                        "raw_token_index": i,
                        "raw_start": raw_token["start"],
                    }
                )
                norm_token_count += 1

            raw_token["norm_token_start"] = norm_token_start
            raw_token["norm_token_count"] = norm_token_count

        return norm_tokens

    @staticmethod
    def tokenize_raw(text):
        """Identify whitespace-separated tokens in text.

        Args:
            text (str): The text to tokenize

        Returns:
            list: A list of dicts, each with the token "text" and its "start"
                character index in the input.
        """
        tokens = []
        token = {}
        token_text = ""
        for i, char in enumerate(text):
            if char.isspace():
                # Whitespace closes the token being built, if any.
                if token and token_text:
                    token["text"] = token_text
                    tokens.append(token)
                token = {}
                token_text = ""
                continue
            if not token_text:
                token = {"start": i}
            token_text += char

        # Flush the last token when the text does not end with whitespace.
        if token and token_text:
            token["text"] = token_text
            tokens.append(token)
        return tokens

    def get_char_index_map(self, raw_text, normalized_text):
        """Generates character index mappings between normalized query and raw query.

        The entity model always operates on normalized query during NLP processing but for
        entity output we need to generate indexes based on raw query.

        The mapping is generated by calculating edit distance and backtracking to get the
        proper alignment.

        Args:
            raw_text (str): Raw query text.
            normalized_text (str): Normalized query text.

        Returns:
            tuple: ``(raw_to_norm_mapping, norm_to_raw_mapping)``, two dicts of character
                indexes; the first maps raw query indexes to normalized query indexes, the
                second maps normalized query indexes to raw query indexes.
        """
        text = raw_text.lower()
        text = self.fold_str_to_ascii(text)

        # NOTE(review): m is the length of raw_text while the comparisons below
        # index into the lower-cased/folded text; this assumes both have the
        # same length - confirm for folding entries longer than one character.
        m = len(raw_text)
        n = len(normalized_text)

        # handle case where normalized text is the empty string
        if n == 0:
            raw_to_norm_mapping = {i: 0 for i in range(m)}
            return raw_to_norm_mapping, {0: 0}

        # handle case where normalized text and raw text are identical
        if raw_text == normalized_text:
            mapping = {i: i for i in range(n)}
            return mapping, mapping

        down, right, diag = "down", "right", "diag"

        # edit_dis[i][j]: edit distance between the first i normalized
        # characters and the first j raw characters.
        edit_dis = [[0] * (m + 1) for _ in range(n + 1)]
        edit_dis[0] = list(range(m + 1))
        for i in range(n + 1):
            edit_dis[i][0] = i

        directions = [[""] * (m + 1) for _ in range(n + 1)]

        for i in range(1, n + 1):
            for j in range(1, m + 1):
                diag_dis = edit_dis[i - 1][j - 1]
                if normalized_text[i - 1] != text[j - 1]:
                    diag_dis += 1

                # dis from going down
                down_dis = edit_dis[i - 1][j] + 1

                # dis from going right
                right_dis = edit_dis[i][j - 1] + 1

                # Ties are resolved in the order down, right, diagonal. Starting
                # from the down move (rather than a fixed sentinel) guarantees a
                # direction is always recorded, however large the distances.
                dis, direction = down_dis, down
                if right_dis < dis:
                    dis, direction = right_dis, right
                if diag_dis < dis:
                    dis, direction = diag_dis, diag

                edit_dis[i][j] = dis
                directions[i][j] = direction

        mapping = {}

        # backtrack
        m_idx = m
        n_idx = n
        while m_idx > 0 and n_idx > 0:
            direction = directions[n_idx][m_idx]
            if direction == diag:
                mapping[n_idx - 1] = m_idx - 1
                m_idx -= 1
                n_idx -= 1
            elif direction == right:
                m_idx -= 1
            else:
                n_idx -= 1

        # initialize the forward mapping (raw to normalized text)
        raw_to_norm_mapping = {0: 0}

        # naive approach for generating forward mapping. this is naive and probably not robust.
        # all leading special characters will get mapped to index position 0 in normalized text.
        raw_to_norm_mapping.update({v: k for k, v in mapping.items()})
        for i in range(m):
            if i not in raw_to_norm_mapping:
                raw_to_norm_mapping[i] = raw_to_norm_mapping[i - 1]

        return raw_to_norm_mapping, mapping

    def fold_char_to_ascii(self, char):
        """Return the ASCII replacement for a single character.

        Args:
            char (str): A single character

        Returns:
            str: ``char`` itself when its code point is below the ASCII cutoff or it has no
                entry in the folding table, otherwise the folding table entry.
        """
        char_ord = ord(char)
        if char_ord < self._ASCII_CUTOFF:
            return char
        return self.ascii_folding_table.get(char_ord, char)

    def fold_str_to_ascii(self, text):
        """Return the text with every character folded to ASCII where possible.

        Args:
            text (str): The text to fold

        Returns:
            str: The text with each character replaced by fold_char_to_ascii
        """
        return "".join(self.fold_char_to_ascii(char) for char in text)

    def __repr__(self):
        return "<Tokenizer exclude_from_norm: {!r}>".format(self.exclude_from_norm)

    @staticmethod
    def create_tokenizer(app_path=None):
        """Creates the tokenizer for the app

        Args:
            app_path (str, optional): MindMeld Application Path

        Returns:
            Tokenizer: a tokenizer
        """
        return Tokenizer(app_path=app_path)