# -*- coding: utf-8 -*-
#
# Copyright (c) 2015 Cisco Systems, Inc. and others.  All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#     http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This module contains Normalizers."""
from abc import ABC, abstractmethod
import codecs
import logging
import re
import unicodedata
from ..constants import CURRENCY_SYMBOLS
from ..path import ASCII_FOLDING_DICT_PATH
logger = logging.getLogger(__name__)
ASCII_CUTOFF = ord("\u0080")


class Normalizer(ABC):
    """Abstract Normalizer Base Class."""

    def __init__(self):
        """Creates a Normalizer instance."""

    @abstractmethod
    def normalize(self, text):
        """
        Args:
            text (str): Input text.
        Returns:
            normalized_text (str): Normalized Text.
        """
        raise NotImplementedError("Subclasses must implement this method")

    def tojson(self):
        """
        Method defined to obtain a recursive JSON representation of a TextPreparationPipeline.
        Args:
            None.
        Returns:
            JSON representation of the Normalizer (dict).
        """
        return {self.__class__.__name__: None}
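
# Illustrative sketch of a custom subclass (hypothetical, not part of this
# module): any text transformation can be used as a pipeline component by
# implementing `normalize` on a Normalizer subclass.
#
#   >>> class StripNormalizer(Normalizer):
#   ...     def normalize(self, text):
#   ...         return text.strip()
#   ...
#   >>> StripNormalizer().normalize("  hello  ")
#   'hello'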


class NoOpNormalizer(Normalizer):
    """A No-Op Normalizer."""

    def normalize(self, text):
        """
        Args:
            text (str): Input text.
        Returns:
            normalized_text (str): The original text, unchanged.
        """
        return text


class ASCIIFold(Normalizer):
    """An ASCII Folding Normalizer."""

    def __init__(self):
        """Creates an ASCIIFold Normalizer instance."""
        super().__init__()
        self.ascii_folding_table = self.load_ascii_folding_table()

    def normalize(self, text):
        """
        Args:
            text (str): Input text.
        Returns:
            normalized_text (str): Normalized Text.
        """
        return self.fold_str_to_ascii(text)

    def fold_char_to_ascii(self, char):
        """
        Return the ASCII character corresponding to the given character.
        Args:
            char (str): A single character to fold.
        Returns:
            char (str): The folded ASCII character, or the original character if
                no mapping exists.
        """
        char_ord = ord(char)
        if char_ord < ASCII_CUTOFF:
            return char
        try:
            return self.ascii_folding_table[char_ord]
        except KeyError:
            return char

    def fold_str_to_ascii(self, text):
        """
        Return the ASCII-folded version of a string.
        Args:
            text (str): The string to fold.
        Returns:
            folded_str (str): The ASCII-folded string.
        """
        return "".join(self.fold_char_to_ascii(char) for char in text)

    @staticmethod
    def load_ascii_folding_table():
        """
        Load the mapping from non-ASCII code points to ASCII replacement characters.
        Returns:
            ascii_folding_table (dict): Mapping from code point (int) to
                ASCII character (str).
        """
        logger.debug(
            "Loading ascii folding mapping from file: %s.", ASCII_FOLDING_DICT_PATH
        )
        ascii_folding_table = {}
        with codecs.open(
            ASCII_FOLDING_DICT_PATH, "r", encoding="unicode_escape"
        ) as mapping_file:
            for line in mapping_file:
                codepoint, ascii_char = line.split()
                ascii_folding_table[ord(codepoint)] = ascii_char
        return ascii_folding_table
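
# Illustrative sketch, assuming the bundled folding table maps common accented
# Latin code points to their unaccented ASCII equivalents:
#
#   >>> folder = ASCIIFold()
#   >>> folder.normalize("café crème")
#   'cafe creme'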


class NFD(Normalizer):
    """Unicode NFD Normalizer Class. (Canonical Decomposition)
    For more details: https://unicode.org/reports/tr15/#Norm_Forms
    """

    def __init__(self):
        """Creates an NFD Normalizer instance."""
        super().__init__()
        self.normalization_type = "NFD"

    def normalize(self, text):
        """
        Args:
            text (str): Input text.
        Returns:
            normalized_text (str): Normalized Text.
        """
        return unicodedata.normalize(self.normalization_type, text)


class NFC(Normalizer):
    """Unicode NFC Normalizer Class.
    (Canonical Decomposition, followed by Canonical Composition)
    For more details: https://unicode.org/reports/tr15/#Norm_Forms
    """

    def __init__(self):
        """Creates an NFC Normalizer instance."""
        super().__init__()
        self.normalization_type = "NFC"

    def normalize(self, text):
        """
        Args:
            text (str): Input text.
        Returns:
            normalized_text (str): Normalized Text.
        """
        return unicodedata.normalize(self.normalization_type, text)


class NFKD(Normalizer):
    """Unicode NFKD Normalizer Class. (Compatibility Decomposition)
    For more details: https://unicode.org/reports/tr15/#Norm_Forms
    """

    def __init__(self):
        """Creates an NFKD Normalizer instance."""
        super().__init__()
        self.normalization_type = "NFKD"

    def normalize(self, text):
        """
        Args:
            text (str): Input text.
        Returns:
            normalized_text (str): Normalized Text.
        """
        return unicodedata.normalize(self.normalization_type, text)


class NFKC(Normalizer):
    """Unicode NFKC Normalizer Class.
    (Compatibility Decomposition, followed by Canonical Composition)
    For more details: https://unicode.org/reports/tr15/#Norm_Forms
    """

    def __init__(self):
        """Creates an NFKC Normalizer instance."""
        super().__init__()
        self.normalization_type = "NFKC"

    def normalize(self, text):
        """
        Args:
            text (str): Input text.
        Returns:
            normalized_text (str): Normalized Text.
        """
        return unicodedata.normalize(self.normalization_type, text)
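
# Illustrative sketch of how the Unicode forms differ (standard `unicodedata`
# behavior, not specific to this module):
#
#   >>> decomposed = NFD().normalize("é")      # "e" followed by U+0301
#   >>> len(decomposed)
#   2
#   >>> len(NFC().normalize(decomposed))       # recomposed to one code point
#   1
#   >>> NFKC().normalize("ﬁ")                 # compatibility form expands the ligature
#   'fi'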


class Lowercase(Normalizer):
    """Lowercase Normalizer Class."""

    def normalize(self, text):
        """
        Args:
            text (str): Input text.
        Returns:
            normalized_text (str): Normalized Text.
        """
        return text.lower()


class RegexNormalizerRule(Normalizer):
    """A Normalizer that applies a single regex substitution rule."""

    def __init__(self, pattern: str, replacement: str):
        """Creates a RegexNormalizerRule instance from a pattern and its replacement."""
        self.pattern = pattern
        self.replacement = replacement
        self._expr = re.compile(self.pattern)

    def normalize(self, s):
        """Applies the regex substitution to the input text and returns the result."""
        return self._expr.sub(self.replacement, s)

    def tojson(self):
        """Returns a JSON representation of the RegexNormalizerRule (dict)."""
        return {self.__class__.__name__ + "##" + self.pattern + "##" + self.replacement: None}
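
# Illustrative usage with a hypothetical pattern (not one of the defaults
# defined below): a rule that strips any parenthesized text.
#
#   >>> rule = RegexNormalizerRule(pattern=r"\(.+?\)", replacement="")
#   >>> rule.normalize("order a pizza (extra large)")
#   'order a pizza '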


class RegexNormalizerRuleFactory:
    """Factory class for creating RegexNormalizerRule objects."""

    # EXCEPTION_CHARS is a class variable so that updates to it are visible
    # throughout the application
    EXCEPTION_CHARS = r"\@\[\]'"

    @staticmethod
    def get_default_regex_normalizer_rule(regex_normalizer: str):
        """Creates a RegexNormalizerRule object based on the given rule and the current
        EXCEPTION_CHARS.
        Args:
            regex_normalizer (str): Name of the desired RegexNormalizerRule
        Returns:
            (RegexNormalizerRule): Default Regex Normalizer Rule
        """
        if regex_normalizer in DEFAULT_REGEX_NORM_RULES:
            regex_rule_dict = DEFAULT_REGEX_NORM_RULES[regex_normalizer]
            # Inserts current EXCEPTION_CHARS in pattern string if applicable
            regex_rule_dict["pattern"] = regex_rule_dict["pattern"].format(
                exception_chars=RegexNormalizerRuleFactory.EXCEPTION_CHARS
            )
            return RegexNormalizerRule(**regex_rule_dict)

    @staticmethod
    def get_regex_normalizers(regex_norm_rules):
        r"""A static method to get a list of RegexNormalizerRule objects from regex_norm_rules.
        Args:
            regex_norm_rules (List[Dict]): Regex normalization rules represented as
                dictionaries. The example rule below removes any text in parentheses.
                {
                    "pattern": "\(.+?\)",
                    "replacement": ""
                }
        Returns:
            regex_normalizer_rules (List[RegexNormalizerRule]): List of RegexNormalizerRule
                objects created from the regex_norm_rules provided.
        """
        return [
            RegexNormalizerRule(pattern=r["pattern"], replacement=r["replacement"])
            for r in regex_norm_rules
        ]
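
# Illustrative sketch of the two factory entry points. "RemoveBeginningSpace"
# is one of the default rule names defined below; the custom rule dict here is
# hypothetical.
#
#   >>> default_rule = RegexNormalizerRuleFactory.get_default_regex_normalizer_rule(
#   ...     "RemoveBeginningSpace"
#   ... )
#   >>> default_rule.normalize("   hello")
#   'hello'
#   >>> custom_rules = RegexNormalizerRuleFactory.get_regex_normalizers(
#   ...     [{"pattern": r"\d+", "replacement": "#"}]
#   ... )
#   >>> custom_rules[0].normalize("room 204")
#   'room #'
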
DEFAULT_REGEX_NORM_RULES = {
    "RemoveAposAtEndOfPossesiveForm": {
        "pattern": r"^'(?=\S)|(?<=\S)'$",
        "replacement": "",
    },
    "RemoveAdjacentAposAndSpace": {"pattern": r" '|' ", "replacement": ""},
    "RemoveBeginningSpace": {"pattern": r"^\s+", "replacement": ""},
    "RemoveTrailingSpace": {"pattern": r"\s+$", "replacement": ""},
    "ReplaceSpacesWithSpace": {"pattern": r"\s+", "replacement": " "},
    "ReplaceUnderscoreWithSpace": {"pattern": r"_", "replacement": " "},
    "SeparateAposS": {"pattern": r"(?<=[^\s])'[sS]", "replacement": " 's"},
    "ReplacePunctuationAtWordStartWithSpace": {
        "pattern": r"^[^\w\d&" + CURRENCY_SYMBOLS + "{exception_chars}" + r"]+",
        "replacement": " ",
    },
    "ReplacePunctuationAtWordEndWithSpace": {
        "pattern": r"[^\w\d&" + CURRENCY_SYMBOLS + "{exception_chars}" + r"]+$",
        "replacement": " ",
    },
    "ReplaceSpecialCharsBetweenLettersAndDigitsWithSpace": {
        "pattern": r"(?<=[^\W\d_])[^\w\d\s&" + "{exception_chars}" + r"]+(?=[\d]+)",
        "replacement": " ",
    },
    "ReplaceSpecialCharsBetweenDigitsAndLettersWithSpace": {
        "pattern": r"(?<=[\d])[^\w\d\s&" + "{exception_chars}" + r"]+(?=[^\W\d_]+)",
        "replacement": " ",
    },
    "ReplaceSpecialCharsBetweenLettersWithSpace": {
        "pattern": r"(?<=[^\W\d_])[^\w\d\s&" + "{exception_chars}" + r"]+(?=[^\W\d_]+)",
        "replacement": " ",
    },
}
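
# Illustrative sketch of one default rule's effect: "SeparateAposS" splits a
# possessive suffix into its own token.
#
#   >>> rule = RegexNormalizerRuleFactory.get_default_regex_normalizer_rule(
#   ...     "SeparateAposS"
#   ... )
#   >>> rule.normalize("john's office")
#   "john 's office"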


class NormalizerFactory:
    """Normalizer Factory Class."""

    @staticmethod
    def get_normalizer(normalizer: str):
        """A static method to get a Normalizer instance.
        Args:
            normalizer (str): Name of the desired Normalizer class or default regex rule.
        Returns:
            (Normalizer): An instance of the requested Normalizer.
        """
        if normalizer in DEFAULT_REGEX_NORM_RULES:
            return RegexNormalizerRuleFactory.get_default_regex_normalizer_rule(
                normalizer
            )
        normalizer_classes = {
            NoOpNormalizer.__name__: NoOpNormalizer,
            ASCIIFold.__name__: ASCIIFold,
            NFC.__name__: NFC,
            NFD.__name__: NFD,
            NFKC.__name__: NFKC,
            NFKD.__name__: NFKD,
            Lowercase.__name__: Lowercase,
        }
        normalizer_class = normalizer_classes.get(normalizer)
        if not normalizer_class:
            raise TypeError(f"{normalizer} is not a valid Normalizer type.")
        return normalizer_class()
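
# Illustrative usage: the factory resolves either a Normalizer class name or a
# default regex rule name defined in DEFAULT_REGEX_NORM_RULES above.
#
#   >>> NormalizerFactory.get_normalizer("Lowercase").normalize("MindMeld")
#   'mindmeld'
#   >>> NormalizerFactory.get_normalizer("ReplaceSpacesWithSpace").normalize("a   b")
#   'a b'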