Source code for mindmeld.models.taggers.embeddings

# -*- coding: utf-8 -*-
#
# Copyright (c) 2015 Cisco Systems, Inc. and others.  All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#     http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import os
import pickle

import numpy as np
from mindmeld.path import (
    PREVIOUSLY_USED_CHAR_EMBEDDINGS_FILE_PATH,
    PREVIOUSLY_USED_WORD_EMBEDDINGS_FILE_PATH,
)

from ..containers import GloVeEmbeddingsContainer

logger = logging.getLogger(__name__)

EMBEDDING_FILE_PATH_TEMPLATE = "glove.6B.{}d.txt"
ALLOWED_WORD_EMBEDDING_DIMENSIONS = [50, 100, 200, 300]
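# For example, EMBEDDING_FILE_PATH_TEMPLATE.format(300) yields "glove.6B.300d.txt";
# the GloVe 6B pretrained vectors ship only in the dimensions listed above.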


class WordSequenceEmbedding:
    """WordSequenceEmbedding encodes a sequence of words into a sequence of
    fixed-dimension real-valued vectors by mapping each word to a vector.
    """

    def __init__(
        self,
        sequence_padding_length,
        token_embedding_dimension=None,
        token_pretrained_embedding_filepath=None,
        use_padding=True,
    ):
        """Initializes the WordSequenceEmbedding class.

        Args:
            sequence_padding_length (int): Padding length of the sequence after which
                the sequence is cut off.
            token_embedding_dimension (int): The embedding dimension of the token.
            token_pretrained_embedding_filepath (str): The embedding filepath to
                extract the embeddings from.
            use_padding (bool): Whether to pad the encoded sequence to
                sequence_padding_length.
        """
        self.token_embedding_dimension = token_embedding_dimension
        self.sequence_padding_length = sequence_padding_length
        self.token_to_embedding_mapping = GloVeEmbeddingsContainer(
            token_embedding_dimension, token_pretrained_embedding_filepath
        ).get_pretrained_word_to_embeddings_dict()
        self._add_historic_embeddings()
        self.use_padding = use_padding

    def encode_sequence_of_tokens(self, token_sequence):
        """Encodes a sequence of tokens into real-valued vectors.

        Args:
            token_sequence (list): A sequence of tokens.

        Returns:
            (list): The encoded sequence of tokens.
        """
        default_encoding = np.zeros(self.token_embedding_dimension)
        if self.use_padding:
            encoded_query = [default_encoding] * self.sequence_padding_length
        else:
            encoded_query = [default_encoding] * len(token_sequence)

        for idx, token in enumerate(token_sequence):
            if idx >= self.sequence_padding_length and self.use_padding:
                break
            encoded_query[idx] = self._encode_token(token)

        return encoded_query

    def _encode_token(self, token):
        """Encodes a token to its corresponding embedding.

        Args:
            token (str): Individual token.

        Returns:
            (numpy.ndarray): The corresponding embedding.
        """
        if token not in self.token_to_embedding_mapping:
            # Out-of-vocabulary tokens get a random embedding that is cached so
            # subsequent lookups of the same token return the same vector.
            random_vector = np.random.uniform(
                -1, 1, size=(self.token_embedding_dimension,)
            )
            self.token_to_embedding_mapping[token] = random_vector
        return self.token_to_embedding_mapping[token]

    def _add_historic_embeddings(self):
        historic_word_embeddings = {}

        # Load historic word embeddings.
        if os.path.exists(PREVIOUSLY_USED_WORD_EMBEDDINGS_FILE_PATH):
            with open(PREVIOUSLY_USED_WORD_EMBEDDINGS_FILE_PATH, "rb") as pkl_file:
                historic_word_embeddings = pickle.load(pkl_file)

        # Only reuse embeddings whose dimension matches the current configuration.
        for word in historic_word_embeddings:
            if len(historic_word_embeddings[word]) == self.token_embedding_dimension:
                self.token_to_embedding_mapping[word] = historic_word_embeddings[word]

    def save_embeddings(self):
        """Save extracted embeddings to historic pickle file."""
        with open(PREVIOUSLY_USED_WORD_EMBEDDINGS_FILE_PATH, "wb") as output:
            pickle.dump(self.token_to_embedding_mapping, output)
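
# Example usage of WordSequenceEmbedding (an illustrative sketch, not part of the
# module; the padding length, query, and 50-dimensional GloVe file below are
# assumptions chosen for demonstration):
#
#     embedder = WordSequenceEmbedding(
#         sequence_padding_length=10,
#         token_embedding_dimension=50,
#         token_pretrained_embedding_filepath="glove.6B.50d.txt",
#     )
#     encoded = embedder.encode_sequence_of_tokens(["set", "an", "alarm"])
#     # `encoded` is a list of 10 numpy arrays (padded), each of shape (50,).
#     embedder.save_embeddings()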


class CharacterSequenceEmbedding:
    """CharacterSequenceEmbedding encodes a sequence of words into a sequence of
    fixed-dimension real-valued vectors by mapping each character in the words
    to a vector.
    """

    def __init__(
        self,
        sequence_padding_length,
        token_embedding_dimension=None,
        max_char_per_word=None,
    ):
        """Initializes the CharacterSequenceEmbedding class.

        Args:
            sequence_padding_length (int): Padding length of the sequence after which
                the sequence is cut off.
            token_embedding_dimension (int): The embedding dimension of the token.
            max_char_per_word (int): The maximum number of characters per word.
        """
        self.token_embedding_dimension = token_embedding_dimension
        self.sequence_padding_length = sequence_padding_length
        self.max_char_per_word = max_char_per_word
        self.token_to_embedding_mapping = {}
        self._add_historic_embeddings()

    def encode_sequence_of_tokens(self, token_sequence):
        """Encodes a sequence of tokens into real-valued vectors.

        Args:
            token_sequence (list): A sequence of tokens.

        Returns:
            (list): The encoded sequence of tokens.
        """
        default_encoding = np.zeros(self.token_embedding_dimension)
        default_char_word = [default_encoding] * self.max_char_per_word
        encoded_query = [default_char_word] * self.sequence_padding_length

        for idx, word_token in enumerate(token_sequence):
            if idx >= self.sequence_padding_length:
                break
            encoded_word = [default_encoding] * self.max_char_per_word
            for idx2, char_token in enumerate(word_token):
                if idx2 >= self.max_char_per_word:
                    break
                encoded_word[idx2] = self._encode_token(char_token)
            encoded_query[idx] = encoded_word

        return encoded_query

    def _encode_token(self, token):
        """Encodes a token to its corresponding embedding.

        Args:
            token (str): Individual character token.

        Returns:
            (numpy.ndarray): The corresponding embedding.
        """
        if token not in self.token_to_embedding_mapping:
            # Unseen characters get a random embedding that is cached so
            # subsequent lookups of the same character return the same vector.
            random_vector = np.random.uniform(
                -1, 1, size=(self.token_embedding_dimension,)
            )
            self.token_to_embedding_mapping[token] = random_vector
        return self.token_to_embedding_mapping[token]

    def _add_historic_embeddings(self):
        historic_char_embeddings = {}

        # Load historic character embeddings.
        if os.path.exists(PREVIOUSLY_USED_CHAR_EMBEDDINGS_FILE_PATH):
            with open(PREVIOUSLY_USED_CHAR_EMBEDDINGS_FILE_PATH, "rb") as pkl_file:
                historic_char_embeddings = pickle.load(pkl_file)

        for char in historic_char_embeddings:
            self.token_to_embedding_mapping[char] = historic_char_embeddings[char]

    def save_embeddings(self):
        """Save extracted embeddings to historic pickle file."""
        with open(PREVIOUSLY_USED_CHAR_EMBEDDINGS_FILE_PATH, "wb") as output:
            pickle.dump(self.token_to_embedding_mapping, output)
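
# Example usage of CharacterSequenceEmbedding (an illustrative sketch, not part of
# the module; the parameter values and query below are assumptions chosen for
# demonstration):
#
#     char_embedder = CharacterSequenceEmbedding(
#         sequence_padding_length=10,
#         token_embedding_dimension=25,
#         max_char_per_word=20,
#     )
#     encoded = char_embedder.encode_sequence_of_tokens(["set", "an", "alarm"])
#     # `encoded` is a 10 x 20 nested list of numpy arrays, each of shape (25,).
#     char_embedder.save_embeddings()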