Word Embedding - Basics
!pip install pandas==0.25.3
!pip install numpy==1.17.3
!pip install Keras==2.3.1
!pip install tensorflow==2.0.0
!pip install tqdm==4.43.0
!pip install matplotlib==3.1.3
In [2]:
import re
import numpy as np

def create_unique_word_dict(text: list) -> dict:
    """
    A method that creates a dictionary where the keys are unique words
    and the values are their indices
    """
    # Getting all the unique words from our text and sorting them alphabetically
    words = list(set(text))
    words.sort()

    # Creating the dictionary for the unique words
    unique_word_dict = {}
    for i, word in enumerate(words):
        unique_word_dict.update({
            word: i
        })

    print(unique_word_dict)
    return unique_word_dict
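# Example (added for illustration): create_unique_word_dict(['king', 'queen', 'king'])
# deduplicates and sorts the words, returning {'king': 0, 'queen': 1}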
def text_preprocessing(
    text: str,
    punctuations=r'''!()-[]{};:'"\,<>./?@#$%^&*_“~''',
    stop_words=['and', 'a', 'is', 'the', 'in', 'be', 'will']
) -> list:
    """
    A method to preprocess text
    """
    # Removing the punctuation characters
    for x in text.lower():
        if x in punctuations:
            text = text.replace(x, "")

    # Removing words that have numbers in them
    text = re.sub(r'\w*\d\w*', '', text)

    # Removing digits
    text = re.sub(r'[0-9]+', '', text)

    # Cleaning the whitespaces
    text = re.sub(r'\s+', ' ', text).strip()

    # Setting every word to lower case
    text = text.lower()

    # Converting our text to a list of words
    text = text.split(' ')

    # Dropping empty strings
    text = [x for x in text if x != '']

    # Dropping stop words
    text = [x for x in text if x not in stop_words]

    return text
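# Example (added for illustration; compare the printed output further down):
# text_preprocessing('The future king is the prince') returns
# ['future', 'king', 'prince'] -- punctuation and the stop words
# 'the' and 'is' are stripped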
# Functions to find the most similar word
def euclidean(vec1: np.array, vec2: np.array) -> float:
    """
    A function to calculate the Euclidean distance between two vectors
    """
    return np.sqrt(np.sum((vec1 - vec2) ** 2))

def find_similar(word: str, embedding_dict: dict, top_n=10) -> list:
    """
    A method to find the most similar words based on the learnt embeddings
    """
    dist_dict = {}
    word_vector = embedding_dict.get(word, [])
    if len(word_vector) > 0:
        for key, value in embedding_dict.items():
            if key != word:
                dist = euclidean(word_vector, value)
                dist_dict.update({
                    key: dist
                })

    return sorted(dist_dict.items(), key=lambda x: x[1])[0:top_n]
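As a quick sanity check (not part of the original notebook), the Euclidean distance between the points (0, 0) and (3, 4) is sqrt(3^2 + 4^2) = 5:

print(euclidean(np.array([0.0, 0.0]), np.array([3.0, 4.0])))  # prints 5.0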
In [68]:
import itertools
import os
import re

import numpy as np
import pandas as pd
from tqdm import tqdm

# Drawing the embeddings
import matplotlib.pyplot as plt

# Deep learning:
from keras.models import Input, Model
from keras.layers import Dense

from scipy import sparse

# Custom functions
# from utility import text_preprocessing, create_unique_word_dict

# Reading the text from the input folder
texts = pd.read_csv('input/sample.csv')
texts = [x for x in texts['text']]

# Defining the window for context (each word is paired with up to
# *window* neighbours on each side)
window = 3
# Creating placeholders for the scanning of the word list
word_lists = []
all_text = []

for text in texts:

    # Cleaning the text
    text = text_preprocessing(text)
    print(text)

    # Appending to the all text list
    all_text += text

    # Creating a context dictionary
    for i, word in enumerate(text):
        print("......i.....", i, "....word. ", word)
        for w in range(window):
            # Getting the context that is ahead by *window* words
            if i + 1 + w < len(text):
                word_lists.append([word] + [text[i + 1 + w]])
            # Getting the context that is behind by *window* words
            if i - w - 1 >= 0:
                word_lists.append([word] + [text[i - w - 1]])

print("word list ", word_lists)
unique_word_dict = create_unique_word_dict(all_text)
print("unique_word_dict", unique_word_dict)

# Defining the number of features (unique words)
n_words = len(unique_word_dict)

# Getting all the unique words
words = list(unique_word_dict.keys())
print("words", words)

# Creating the X and Y matrices using one hot encoding
X = []
Y = []

for i, word_list in tqdm(enumerate(word_lists)):

    # Getting the indices
    main_word_index = unique_word_dict.get(word_list[0])
    context_word_index = unique_word_dict.get(word_list[1])
    print("......i.....", i, "....word_list....", word_list)
    # Creating the placeholders
    X_row = np.zeros(n_words)
    Y_row = np.zeros(n_words)

    # One hot encoding the main word
    X_row[main_word_index] = 1

    # One hot encoding the Y matrix words
    Y_row[context_word_index] = 1

    # Appending to the main matrices
    X.append(X_row)
    Y.append(Y_row)

# Converting the matrices into a sparse format because the vast majority
# of the entries are zeros
X = sparse.csr_matrix(X)
Y = sparse.csr_matrix(Y)
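# For example (indices taken from unique_word_dict shown further down,
# added for illustration): the pair ['king', 'prince'] produces a
# length-21 X_row with a single 1 at index 7 ('king') and a Y_row with
# a single 1 at index 11 ('prince')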
# Defining the size of the embedding
embed_size = 2

# Defining the neural network: a one hot input, a linear hidden layer of
# embed_size units (whose weights become the embeddings) and a softmax
# output over the vocabulary
inp = Input(shape=(X.shape[1],))
x = Dense(units=embed_size, activation='linear')(inp)
x = Dense(units=Y.shape[1], activation='softmax')(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Optimizing the network weights
model.fit(
    x=X,
    y=Y,
    batch_size=256,
    epochs=1000
)
# Obtaining the weights from the neural network.
# These are the so-called word embeddings.

# The input layer
weights = model.get_weights()[0]

# Creating a dictionary to store the embeddings in. The key is a unique word
# and the value is the numeric vector
embedding_dict = {}
for word in words:
    embedding_dict.update({
        word: weights[unique_word_dict.get(word)]
    })

# Plotting the embeddings
plt.figure(figsize=(10, 10))
for word in list(unique_word_dict.keys()):
    coord = embedding_dict.get(word)
    plt.scatter(coord[0], coord[1])
    plt.annotate(word, (coord[0], coord[1]))

# Saving the embedding vectors to a txt file
try:
    os.mkdir(f'{os.getcwd()}\\output')
except Exception as e:
    print(f'Cannot create output folder: {e}')

with open(f'{os.getcwd()}\\output\\embedding.txt', 'w') as f:
    for key, value in embedding_dict.items():
        try:
            f.write(f'{key}: {value}\n')
        except Exception as e:
            print(f'Cannot write word {key} to dict: {e}')
98it [00:00, 8164.83it/s]
['future', 'king', 'prince']
......i..... 0 ....word. future
......i..... 1 ....word. king
......i..... 2 ....word. prince
['daughter', 'princess']
......i..... 0 ....word. daughter
......i..... 1 ....word. princess
['son', 'prince']
......i..... 0 ....word. son
......i..... 1 ....word. prince
['only', 'man', 'can', 'king']
......i..... 0 ....word. only
......i..... 1 ....word. man
......i..... 2 ....word. can
......i..... 3 ....word. king
['only', 'woman', 'can', 'queen']
......i..... 0 ....word. only
......i..... 1 ....word. woman
In [69]:
embedding_dict
Out[69]:
{'beautiful': array([-1.3688022, -1.2648398], dtype=float32),
 'boy': array([-1.2045977,  1.3686044], dtype=float32),
 'can': array([-0.5418091 ,  0.26029423], dtype=float32),
 'children': array([ 0.6832872, -1.0086254], dtype=float32),
 'daughter': array([-1.1178044, -1.1369839], dtype=float32),
 'family': array([ 0.9929922, -0.9225423], dtype=float32),
 'future': array([-0.43754777,  0.3495058 ], dtype=float32),
 'king': array([0.8600612 , 0.08342463], dtype=float32),
 'man': array([0.05348093, 1.1474305 ], dtype=float32),
 'now': array([-0.94651306,  0.7657763 ], dtype=float32),
 'only': array([-0.8729304 ,  0.46015522], dtype=float32),
 'prince': array([0.6690954, 1.212434 ], dtype=float32),
 'princess': array([0.8468873, 0.5226464], dtype=float32),
 'queen': array([ 0.45890674, -0.7914091 ], dtype=float32),
 'realm': array([ 0.43352002, -0.9279783 ], dtype=float32),
 'royal': array([ 0.57931876, -1.0499104 ], dtype=float32),
 'rule': array([ 0.82716703, -0.5857244 ], dtype=float32),
 'son': array([-0.5071175,  1.6089387], dtype=float32),
 'strong': array([-1.0395274,  1.5985358], dtype=float32),
 'their': array([ 0.66103125, -0.9436734 ], dtype=float32),
 'woman': array([-0.34705597, -0.08890723], dtype=float32)}
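With the learnt embedding_dict in hand, the find_similar helper defined earlier can rank neighbours by Euclidean distance. A minimal usage sketch (the exact ranking varies between runs, since the network weights are randomly initialised):

# The three words whose embedding vectors lie closest to 'king'
find_similar('king', embedding_dict, top_n=3)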
In [66]:
model.get_weights()[0]
Out[66]:
array([[-0.4522477 ],
       [-1.2182469 ],
       [-0.2559153 ],
       [ 1.1059917 ],
       [-0.18299527],
       [ 1.2638413 ],
       [-0.37547705],
       [ 0.23008995],
       [-0.5719701 ],
       [-0.95362383],
       [-0.46083274],
       [-0.5689876 ],
       [-0.63507533],
       [ 0.5916492 ],
       [ 1.2399746 ],
       [ 1.1080786 ],
       [ 0.90944767],
       [-1.4457749 ],
       [-1.5976491 ],
       [ 1.5208826 ],
       [-0.28549275]], dtype=float32)
In [52]:
X.shape[1]
Out[52]: 21
In [53]:
Y.shape[1]
Out[53]: 21
In [39]:
n_words
Out[39]: 21
In [38]:
unique_word_dict
Out[38]:
{'beautiful': 0,
'boy': 1,
'can': 2,
'children': 3,
'daughter': 4,
'family': 5,
'future': 6,
'king': 7,
'man': 8,
'now': 9,
'only': 10,
'prince': 11,
'princess': 12,
'queen': 13,
'realm': 14,
'royal': 15,
'rule': 16,
'son': 17,
'strong': 18,
'their': 19,
'woman': 20}
In [49]:
len(word_lists)
Out[49]: 98
[['future', 'king'], ['future', 'prince'], ['king', 'prince'], ['king'
 ['prince', 'king'], ['prince', 'future'], ['daughter', 'princess'],
 ['son', 'prince'], ['prince', 'son'], ['only', 'man'], ['only', 'can
 ['man', 'only'], ['man', 'king'], ['can', 'king'], ['can', 'man'], [
 ['king', 'only'], ['only', 'woman'], ['only', 'can'], ['only', 'quee
 ['woman', 'queen'], ['can', 'queen'], ['can', 'woman'], ['can', 'onl
 ['queen', 'only'], ['princess', 'queen'], ['queen', 'princess'], ['q
 ['queen', 'realm'], ['king', 'rule'], ['king', 'queen'], ['king', 'r
 ['rule', 'king'], ['rule', 'queen'], ['realm', 'rule'], ['realm', 'k
 ['prince', 'man'], ['strong', 'man'], ['strong', 'prince'], ['man',
 ['princess', 'beautiful'], ['princess', 'woman'], ['beautiful', 'wom
 ['woman', 'beautiful'], ['woman', 'princess'], ['royal', 'family'],
 ['family', 'king'], ['family', 'royal'], ['family', 'queen'], ['fami
 ['king', 'family'], ['king', 'their'], ['king', 'royal'], ['king', '
 ['queen', 'children'], ['queen', 'family'], ['queen', 'royal'], ['th
 ['their', 'king'], ['their', 'family'], ['children', 'their'], ['chi
 ['prince', 'only'], ['prince', 'boy'], ['prince', 'now'], ['only', '
 ['boy', 'now'], ['boy', 'only'], ['boy', 'prince'], ['now', 'boy'], ['boy', 'man'], ['man', 'boy']]

In [35]:
texts
Out[35]:
['The future king is the prince',
 'Daughter is the princess ',
 'Son is the prince',
 'Only a man can be a king ',
 'Only a woman can be a queen',
 'The princess will be a queen',
 'Queen and king rule the realm',
 'The prince is a strong man',
 'The princess is a beautiful woman ',
 'The royal family is the king and queen and their children',
 'Prince is only a boy now',
 'A boy will be a man']
In [21]:
!pip install nltk
Processing c:\users\win10\appdata\local\pip\cache\wheels\de\5e\42\64abaeca668161c3e2cecc24f864a8fc421e3d07a104fc8a51\nltk-3.5-py3-none-any.whl
Requirement already satisfied: regex in c:\users\win10\anaconda3\envs\nlp_projects\lib\site-packages (from nltk) (2020.10.28)
Requirement already satisfied: joblib in c:\users\win10\anaconda3\envs\nlp_projects\lib\site-packages (from nltk) (0.17.0)
Requirement already satisfied: click in c:\users\win10\anaconda3\envs\nlp_projects\lib\site-packages (from nltk) (7.1.2)
Requirement already satisfied: tqdm in c:\users\win10\anaconda3\envs\nlp_projects\lib\site-packages (from nltk) (4.43.0)
Installing collected packages: nltk
Successfully installed nltk-3.5
In [30]:
import gensim
from nltk.corpus import brown

# Training a gensim Word2Vec model on sentences from the Brown corpus
model = gensim.models.Word2Vec(brown.sents())
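As a quick check of the trained model (a sketch; the exact neighbours and scores depend on the training run), gensim's KeyedVectors interface can be queried for nearest neighbours, much like the hand-rolled find_similar above:

# Words closest to 'king' by cosine similarity in the Word2Vec space
model.wv.most_similar('king', topn=5)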