# -*- coding: utf-8 -*-
#
# Copyright (c) 2015 Cisco Systems, Inc. and others. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This module contains the tokenizer."""
import codecs
import logging
import re
import sre_constants
from .components._config import get_tokenizer_config
from .constants import CURRENCY_SYMBOLS
from .path import ASCII_FOLDING_DICT_PATH
logger = logging.getLogger(__name__)
class Tokenizer:
    """Normalizes and tokenizes text.

    A tokenizer holds two compiled regular expressions: ``self.compiled``, built
    from the rules in ``_init_regex``, and ``self.keep_special_compiled``, built
    from the patterns returned by ``get_tokenizer_config``. ``tokenize`` picks one
    of them per call depending on ``keep_special_chars``.
    """

    # Code point of the first non-ASCII character (U+0080). Characters below it
    # are returned unchanged by fold_char_to_ascii.
    _ASCII_CUTOFF = ord("\u0080")

    def __init__(self, app_path=None, exclude_from_norm=None):
        """Initializes the tokenizer.

        Args:
            app_path (str, optional): Forwarded to ``get_tokenizer_config`` to
                look up the tokenizer configuration.
            exclude_from_norm (list, optional): Characters to exclude from
                normalization.
        """
        # Remembered so that __deepcopy__ can rebuild an equivalent tokenizer.
        self._app_path = app_path
        self.ascii_folding_table = self.load_ascii_folding_table()
        self.exclude_from_norm = exclude_from_norm or []
        self.config = get_tokenizer_config(app_path, self.exclude_from_norm)
        self._custom = False
        self._init_regex()

    @staticmethod
    def _compile_alternation(patterns):
        """Compiles ``patterns`` into one regex, each pattern in its own group.

        Args:
            patterns (iterable of str): Regex fragments to combine.

        Returns:
            A compiled regex equivalent to ``(p1)|(p2)|...``.

        Raises:
            re.error: If the combined expression is not a valid regex.
        """
        return re.compile("(%s)" % ")|(".join(patterns), re.UNICODE)

    def _init_regex(self):
        """Builds the compiled patterns and the replacement lookup table.

        Sets ``self.replace_lookup``, ``self._custom``,
        ``self.keep_special_compiled`` and ``self.compiled``.

        Raises:
            re.error: If the built-in rules do not compile, or if the configured
                patterns do not compile and there is no default to fall back to.
        """
        # Rules used when keep_special_chars=False. Order matters: the rules are
        # joined into a single alternation, so earlier rules win.
        letter_pattern_str = r"[^\W\d_]+"
        # NOTE: these characters are spliced verbatim into a character class, so
        # regex metacharacters such as "]" or "\" in exclude_from_norm are not
        # escaped here.
        to_exclude = CURRENCY_SYMBOLS + "".join(self.exclude_from_norm)

        regex_list = [
            r"?P<start>^[^\w\d&" + to_exclude + "]+",
            r"?P<end>[^\w\d&" + to_exclude + "]+$",
            # letters, special characters, then digits
            "?P<pattern1>(?P<pattern1_replace>"
            + letter_pattern_str
            + ")"
            + r"[^\w\d\s&]+(?=[\d]+)",
            # digits, special characters, then letters
            r"?P<pattern2>(?P<pattern2_replace>[\d]+)[^\w\d\s&]+(?="
            + letter_pattern_str
            + ")",
            # letters, special characters, then letters
            "?P<pattern3>(?P<pattern3_replace>"
            + letter_pattern_str
            + ")"
            + r"[^\w\d\s&]+(?="
            + letter_pattern_str
            + ")",
            "?P<underscore>_",
            r"?P<begspace>^\s+",
            r"?P<trailspace>\s+$",
            r"?P<spaceplus>\s+",
            "?P<apos_space> '|' ",
            r"?P<apos_s>(?<=[^\s])'[sS]",
            # handle the apostrophes used at the end of a possessive form, e.g. dennis'
            r"?P<apos_poss>(^'(?=\S)|(?<=\S)'$)",
        ]

        # Maps the name of the matched group to (replacement template, name of
        # the group whose text fills the template, or None for a fixed string).
        # Some keys ("escape1", "escape2", "apostrophe") have no rule above;
        # presumably they are used by the patterns supplied through the config.
        self.replace_lookup = {
            "apos_s": (" 's", None),
            "apos_poss": ("", None),
            "apos_space": (" ", None),
            "begspace": ("", None),
            "end": (" ", None),
            "escape1": ("{0}", "escape1_replace"),
            "escape2": ("{0} ", "escape2_replace"),
            "pattern1": ("{0} ", "pattern1_replace"),
            "pattern2": ("{0} ", "pattern2_replace"),
            "pattern3": ("{0} ", "pattern3_replace"),
            "spaceplus": (" ", None),
            "start": (" ", None),
            "trailspace": ("", None),
            "underscore": (" ", None),
            "apostrophe": (" ", None),
        }

        # Check if custom pattern is being used or MM defined
        custom_patterns = self.config.get("allowed_patterns")
        self._custom = bool(custom_patterns)
        patterns = custom_patterns or self.config["default_allowed_patterns"]

        try:
            self.keep_special_compiled = self._compile_alternation(patterns)
        except re.error:
            logger.error(
                "Regex compilation failed for the following patterns: %s",
                ")|(".join(patterns),
            )
            default_patterns = self.config.get("default_allowed_patterns")
            if not custom_patterns or not default_patterns:
                # Nothing to fall back to; without a compiled pattern the
                # tokenizer would fail later with a less helpful error.
                raise
            logger.warning("Falling back to the default allowed patterns.")
            self._custom = False
            self.keep_special_compiled = self._compile_alternation(default_patterns)

        self.compiled = self._compile_alternation(regex_list)

    # Needed for train-roles where queries are deep copied (and thus tokenizer).
    # Pre compiled patterns don't deepcopy natively. Bug introduced past python 2.5
    # TODO investigate necessity of deepcopy in train-roles
    def __deepcopy__(self, memo):
        """Returns a freshly constructed tokenizer with the same settings.

        Args:
            memo (dict): The memo dictionary supplied by ``copy.deepcopy``.

        Returns:
            Tokenizer: A new tokenizer built from the same ``app_path`` and
            ``exclude_from_norm``.
        """
        # TODO: optimize this
        clone = Tokenizer(
            # getattr guards instances that were created without __init__.
            app_path=getattr(self, "_app_path", None),
            exclude_from_norm=self.exclude_from_norm,
        )
        memo[id(self)] = clone
        return clone

    @staticmethod
    def load_ascii_folding_table():
        """Loads the ASCII folding table from ``ASCII_FOLDING_DICT_PATH``.

        Each line is split on whitespace; the first token must be a single
        character and the second token is its replacement. Lines with fewer
        than two tokens are skipped.

        Returns:
            dict: Maps the code point (int) of the first token to the second
            token (str).
        """
        logger.debug(
            "Loading ascii folding mapping from file: %s.", ASCII_FOLDING_DICT_PATH
        )
        ascii_folding_table = {}
        with codecs.open(
            ASCII_FOLDING_DICT_PATH, "r", encoding="unicode_escape"
        ) as mapping_file:
            for line in mapping_file:
                tokens = line.split()
                if len(tokens) < 2:
                    # Blank or incomplete line: nothing to map.
                    continue
                ascii_folding_table[ord(tokens[0])] = tokens[1]
        return ascii_folding_table

    def _one_xlat(self, match_object):
        """Looks up the replacement text for a single regex match.

        Args:
            match_object: A regex match object.

        Returns:
            str: The replacement registered in ``self.replace_lookup`` for the
            match's last matched group.

        Raises:
            KeyError: If the last matched group is unnamed or has no entry in
                ``self.replace_lookup``.
        """
        replace_str, format_group = self.replace_lookup[match_object.lastgroup]
        if format_group:
            replace_str = replace_str.format(match_object.group(format_group))
        return replace_str

    def multiple_replace(self, text, compiled):
        """Applies the replacements in ``self.replace_lookup`` to ``text``.

        Args:
            text (str): The text to perform matching on.
            compiled: A compiled regex object that can be used for matching.

        Returns:
            str: The text with replacements applied. If a match of ``compiled``
            has no entry in ``self.replace_lookup``, the concatenation of all
            matches of ``compiled`` in ``text`` is returned instead.
        """
        try:
            # Checks if replacement can be found in pre-defined match object (non-custom).
            filtered = compiled.sub(self._one_xlat, text)
        except KeyError:
            # In case of custom/app-specific tokenizer configuration: keep only
            # the text matched by the allowed patterns. group(0) is the whole
            # match, which also works for a single pattern and for empty matches.
            logger.info("Using custom tokenizer configuration.")
            return "".join(match.group(0) for match in compiled.finditer(text))
        if self._custom:
            # No key error with a custom tokenizer: the token has unwanted
            # special characters, so apply the built-in rules instead.
            return self.compiled.sub(self._one_xlat, text)
        return filtered

    def normalize(self, text, keep_special_chars=True):
        """Normalizes a text string.

        Args:
            text (str): The text to normalize.
            keep_special_chars (bool): Forwarded to ``tokenize``.

        Returns:
            str: The normalized tokens of ``text`` joined by single spaces.
        """
        return " ".join(
            token["entity"] for token in self.tokenize(text, keep_special_chars)
        )

    def tokenize(self, text, keep_special_chars=True):
        """Tokenizes the input text and returns the normalized tokens.

        The text is split on whitespace into raw tokens. Each raw token is then
        passed through ``multiple_replace``, folded to ASCII, lowercased and
        split on whitespace again, so one raw token may yield zero, one or
        several normalized tokens.

        With ``keep_special_chars=False`` the rules from ``_init_regex`` apply:

        1. leading and trailing runs of special characters become a space,
           except '&', currency symbols and the excluded characters
        2. special characters (except '&') between letters and digits, digits
           and letters, or letters and letters become a space
        3. underscores become a space
        4. apostrophes are split off or dropped (e.g. ``x's`` becomes ``x 's``)

        With ``keep_special_chars=True`` the pattern built from the tokenizer
        configuration is used instead.

        Args:
            text (str): The text to normalize.
            keep_special_chars (bool): If True, use the configured pattern that
                keeps special characters used in annotations.

        Returns:
            list: Dicts with the keys ``entity`` (normalized text),
            ``raw_entity`` (raw token text), ``raw_token_index`` and
            ``raw_start`` (character offset of the raw token in ``text``).
        """
        pattern = self.keep_special_compiled if keep_special_chars else self.compiled
        norm_tokens = []
        for raw_index, raw_token in enumerate(self.tokenize_raw(text)):
            raw_token_text = raw_token["text"]
            if not raw_token_text:
                continue
            norm_token_start = len(norm_tokens)
            replaced = self.multiple_replace(raw_token_text, pattern)
            # remove diacritics and fold the characters to their ascii
            # equivalent where possible, then lowercase
            pieces = self.fold_str_to_ascii(replaced).lower().split()
            for piece in pieces:
                norm_tokens.append(
                    {
                        "entity": piece,
                        "raw_entity": raw_token_text,
                        "raw_token_index": raw_index,
                        "raw_start": raw_token["start"],
                    }
                )
            # Bookkeeping on the raw token; the raw tokens are not returned.
            raw_token["norm_token_start"] = norm_token_start
            raw_token["norm_token_count"] = len(pieces)
        return norm_tokens

    @staticmethod
    def tokenize_raw(text):
        """Splits text on whitespace, recording where each token starts.

        Args:
            text (str): The text to tokenize.

        Returns:
            list: Dicts with the keys ``start`` (index of the token's first
            character in ``text``) and ``text`` (the token itself).
        """
        tokens = []
        start = None  # index where the current token began, None between tokens
        for index, char in enumerate(text):
            if char.isspace():
                if start is not None:
                    tokens.append({"start": start, "text": text[start:index]})
                    start = None
            elif start is None:
                start = index
        if start is not None:
            tokens.append({"start": start, "text": text[start:]})
        return tokens

    def _fold_with_origins(self, raw_text):
        """Lowercases and folds ``raw_text``, tracking where each character came from.

        Args:
            raw_text (str): Raw query text.

        Returns:
            tuple: ``(folded, origins)`` where ``folded`` is the list of folded
            characters and ``origins[k]`` is the index in ``raw_text`` of the
            character that produced ``folded[k]``.
        """
        lowered = raw_text.lower()
        if len(lowered) == len(raw_text):
            # Lowercasing kept a one-to-one correspondence, so use the
            # whole-string result as is.
            pieces = lowered
        else:
            # Some character expanded when lowercased; lowercase per character
            # so that every piece can be traced back to its raw index.
            pieces = [char.lower() for char in raw_text]

        folded = []
        origins = []
        for raw_index, piece in enumerate(pieces):
            folded_piece = self.fold_str_to_ascii(piece)
            folded.extend(folded_piece)
            origins.extend([raw_index] * len(folded_piece))
        return folded, origins

    def get_char_index_map(self, raw_text, normalized_text):
        """Generates character index mappings between raw and normalized text.

        The entity model always operates on the normalized query during NLP
        processing, but entity output needs indexes based on the raw query.
        The mapping is generated by calculating the edit distance between the
        lowercased, ASCII-folded raw text and the normalized text, then
        backtracking to get the alignment.

        Args:
            raw_text (str): Raw query text.
            normalized_text (str): Normalized query text.

        Returns:
            tuple: ``(raw_to_norm_mapping, norm_to_raw_mapping)``, two dicts of
            character indexes. The first maps every raw index to a normalized
            index; the second maps the aligned normalized indexes to raw indexes.
        """
        raw_len = len(raw_text)
        norm_len = len(normalized_text)

        # handle case where normalized text is the empty string
        if norm_len == 0:
            return {i: 0 for i in range(raw_len)}, {0: 0}

        # handle case where normalized text and raw text are identical
        if raw_text == normalized_text:
            return (
                {i: i for i in range(norm_len)},
                {i: i for i in range(norm_len)},
            )

        # A raw character may fold to several characters, so the alignment runs
        # on the folded text and is translated back through ``origins``.
        folded, origins = self._fold_with_origins(raw_text)
        text_len = len(folded)

        down, right, diag = "down", "right", "diag"

        # edit_dis[i][j]: distance between the first i normalized characters
        # and the first j folded characters.
        edit_dis = [[0] * (text_len + 1) for _ in range(norm_len + 1)]
        edit_dis[0] = list(range(text_len + 1))
        for i in range(norm_len + 1):
            edit_dis[i][0] = i
        directions = [[""] * (text_len + 1) for _ in range(norm_len + 1)]

        for i in range(1, norm_len + 1):
            norm_char = normalized_text[i - 1]
            for j in range(1, text_len + 1):
                diag_dis = edit_dis[i - 1][j - 1]
                if norm_char != folded[j - 1]:
                    diag_dis += 1
                # dis from going down
                down_dis = edit_dis[i - 1][j] + 1
                # dis from going right
                right_dis = edit_dis[i][j - 1] + 1

                # On ties the preference is down, then right, then diagonal.
                dis, direction = down_dis, down
                if right_dis < dis:
                    dis, direction = right_dis, right
                if diag_dis < dis:
                    dis, direction = diag_dis, diag
                edit_dis[i][j] = dis
                directions[i][j] = direction

        # backtrack
        mapping = {}
        text_idx = text_len
        norm_idx = norm_len
        while text_idx > 0 and norm_idx > 0:
            direction = directions[norm_idx][text_idx]
            if direction == diag:
                mapping[norm_idx - 1] = origins[text_idx - 1]
                text_idx -= 1
                norm_idx -= 1
            elif direction == right:
                text_idx -= 1
            else:
                norm_idx -= 1

        # initialize the forward mapping (raw to normalized text)
        raw_to_norm_mapping = {0: 0}
        # naive approach for generating forward mapping. this is naive and probably not robust.
        # all leading special characters will get mapped to index position 0 in normalized text.
        # ``mapping`` was filled from the end backwards, so when several
        # normalized indexes share a raw index the smallest one wins.
        raw_to_norm_mapping.update({v: k for k, v in mapping.items()})
        for i in range(raw_len):
            if i not in raw_to_norm_mapping:
                raw_to_norm_mapping[i] = raw_to_norm_mapping[i - 1]
        return raw_to_norm_mapping, mapping

    def fold_char_to_ascii(self, char):
        """Returns the ASCII replacement for a single character.

        Args:
            char (str): A single character.

        Returns:
            str: ``char`` itself if it is below ``_ASCII_CUTOFF`` or has no
            entry in ``self.ascii_folding_table``, otherwise the table entry.
        """
        char_ord = ord(char)
        if char_ord < self._ASCII_CUTOFF:
            return char
        return self.ascii_folding_table.get(char_ord, char)

    def fold_str_to_ascii(self, text):
        """Folds every character of a string with ``fold_char_to_ascii``.

        Args:
            text (str): The string to fold.

        Returns:
            str: The concatenation of the folded characters.
        """
        return "".join(self.fold_char_to_ascii(char) for char in text)

    def __repr__(self):
        return "<Tokenizer exclude_from_norm: {!r}>".format(self.exclude_from_norm)

    @staticmethod
    def create_tokenizer(app_path=None):
        """Creates the tokenizer for the app.

        Args:
            app_path (str, optional): MindMeld Application Path

        Returns:
            Tokenizer: a tokenizer
        """
        return Tokenizer(app_path=app_path)