Python: Day 8 – Lesson 8

Word Embedding - Basics

!pip install pandas==0.25.3

!pip install numpy==1.17.3

!pip install Keras==2.3.1

!pip install tensorflow==2.0.0

!pip install tqdm==4.43.0

!pip install matplotlib==3.1.3

Collecting pandas==0.25.3
  Using cached pandas-0.25.3-cp36-cp36m-win_amd64.whl (9.0 MB)
Requirement already satisfied: python-dateutil>=2.6.1 in c:\users\win10\anaconda3\envs\nlp_projects\lib\site-packages (from pandas==0.25.3) (2.8.1)
Collecting numpy>=1.13.3
  Downloading numpy-1.19.3-cp36-cp36m-win_amd64.whl (13.2 MB)
Collecting pytz>=2017.2
  Using cached pytz-2020.1-py2.py3-none-any.whl (510 kB)
Requirement already satisfied: six>=1.5 in c:\users\win10\anaconda3\envs\nlp_projects\lib\site-packages (from python-dateutil>=2.6.1->pandas==0.25.3) (1.15.0)
Installing collected packages: numpy, pytz, pandas
Successfully installed numpy-1.19.3 pandas-0.25.3 pytz-2020.1
Collecting numpy==1.17.3
  Downloading numpy-1.17.3-cp36-cp36m-win_amd64.whl (12.7 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.19.3

In [2]:

import re

import numpy as np


def create_unique_word_dict(text: list) -> dict:
    """
    A method that creates a dictionary where the keys are unique words
    and the values are their indices.
    """
    # Getting all the unique words from our text and sorting them alphabetically
    words = list(set(text))
    words.sort()

    # Creating the dictionary for the unique words
    unique_word_dict = {}
    for i, word in enumerate(words):
        unique_word_dict.update({
            word: i
        })
    print(unique_word_dict)

    return unique_word_dict


def text_preprocessing(
    text: str,
    punctuations=r'''!()-[]{};:'"\,<>./?@#$%^&*_“~''',
    stop_words=['and', 'a', 'is', 'the', 'in', 'be', 'will']
) -> list:
    """
    A method to preprocess text.
    """
    # Removing punctuation
    for x in text.lower():
        if x in punctuations:
            text = text.replace(x, "")

    # Removing words that have numbers in them
    text = re.sub(r'\w*\d\w*', '', text)

    # Removing digits
    text = re.sub(r'[0-9]+', '', text)

    # Cleaning the whitespaces
    text = re.sub(r'\s+', ' ', text).strip()

    # Setting every word to lower case
    text = text.lower()

    # Converting all our text to a list of words
    text = text.split(' ')

    # Dropping empty strings
    text = [x for x in text if x != '']

    # Dropping stop words
    text = [x for x in text if x not in stop_words]

    return text


# Functions to find the most similar word

def euclidean(vec1: np.array, vec2: np.array) -> float:
    """
    A function to calculate the euclidean distance between two vectors.
    """
    return np.sqrt(np.sum((vec1 - vec2) ** 2))


def find_similar(word: str, embedding_dict: dict, top_n=10) -> list:
    """
    A method to find the most similar words based on the learnt embeddings.
    """
    dist_dict = {}
    word_vector = embedding_dict.get(word, [])
    if len(word_vector) > 0:
        for key, value in embedding_dict.items():
            if key != word:
                dist = euclidean(word_vector, value)
                dist_dict.update({
                    key: dist
                })

    return sorted(dist_dict.items(), key=lambda x: x[1])[0:top_n]

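A quick sanity check of the two helpers above (a minimal sketch; the sentence is the first one from the sample data loaded below):

sentence = "The future king is the prince"

# Punctuation, digits, stop words and upper case are stripped
tokens = text_preprocessing(sentence)
print(tokens)   # ['future', 'king', 'prince']

# Unique words get alphabetically ordered integer indices
word_index = create_unique_word_dict(tokens)   # {'future': 0, 'king': 1, 'prince': 2}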
In [68]:

import itertools
import pandas as pd
import numpy as np
import re
import os

from tqdm import tqdm

# Drawing the embeddings
import matplotlib.pyplot as plt

# Deep learning:
from keras.models import Input, Model
from keras.layers import Dense

from scipy import sparse

# Custom functions
# from utility import text_preprocessing, create_unique_word_dict

# Reading the text from the input folder
texts = pd.read_csv('input/sample.csv')
texts = [x for x in texts['text']]

# Defining the window for context (number of context words on each side)
window = 3

# Creating a placeholder for the scanning of the word list
word_lists = []
all_text = []

for text in texts:

    # Cleaning the text
    text = text_preprocessing(text)
    print(text)

    # Appending to the all text list
    all_text += text

    # Creating a context dictionary
    for i, word in enumerate(text):
        print("......i.....", i, "....word. ", word)
        for w in range(window):
            # Getting the context that is ahead by *window* words
            if i + 1 + w < len(text):
                word_lists.append([word] + [text[(i + 1 + w)]])
            # Getting the context that is behind by *window* words
            if i - w - 1 >= 0:
                word_lists.append([word] + [text[(i - w - 1)]])

print("word list ", word_lists)

unique_word_dict = create_unique_word_dict(all_text)
print("unique_word_dict", unique_word_dict)


# Defining the number of features (unique words)
n_words = len(unique_word_dict)

# Getting all the unique words
words = list(unique_word_dict.keys())
print("words", words)

# Creating the X and Y matrices using one hot encoding
X = []
Y = []

for i, word_list in tqdm(enumerate(word_lists)):

    # Getting the indices
    main_word_index = unique_word_dict.get(word_list[0])
    context_word_index = unique_word_dict.get(word_list[1])
    print("......i.....", i, "....word_list....", word_list)

    # Creating the placeholders
    X_row = np.zeros(n_words)
    Y_row = np.zeros(n_words)

    # One hot encoding the main word
    X_row[main_word_index] = 1

    # One hot encoding the Y matrix words
    Y_row[context_word_index] = 1

    # Appending to the main matrices
    X.append(X_row)
    Y.append(Y_row)

# Converting the matrices into a sparse format because the vast majority of entries are zero
X = sparse.csr_matrix(X)
Y = sparse.csr_matrix(Y)


# Defining the size of the embedding
embed_size = 2

# Defining the neural network
inp = Input(shape=(X.shape[1],))
x = Dense(units=embed_size, activation='linear')(inp)
x = Dense(units=Y.shape[1], activation='softmax')(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Optimizing the network weights
model.fit(
    x=X,
    y=Y,
    batch_size=256,
    epochs=1000
)


# Obtaining the weights from the neural network.
# These are the so called word embeddings

# The input layer
weights = model.get_weights()[0]

# Creating a dictionary to store the embeddings in. The key is a unique word,
# the value is the numeric vector
embedding_dict = {}
for word in words:
    embedding_dict.update({
        word: weights[unique_word_dict.get(word)]
    })

# Plotting the embeddings
plt.figure(figsize=(10, 10))
for word in list(unique_word_dict.keys()):
    coord = embedding_dict.get(word)
    plt.scatter(coord[0], coord[1])
    plt.annotate(word, (coord[0], coord[1]))


# Saving the embedding vector to a txt file
try:
    os.mkdir(f'{os.getcwd()}\\output')
except Exception as e:
    print(f'Cannot create output folder: {e}')

with open(f'{os.getcwd()}\\output\\embedding.txt', 'w') as f:
    for key, value in embedding_dict.items():
        try:
            f.write(f'{key}: {value}\n')
        except Exception as e:
            print(f'Cannot write word {key} to dict: {e}')




98it [00:00, 8164.83it/s]

['future', 'king', 'prince']

......i..... 0 ....word. future

......i..... 1 ....word. king

......i..... 2 ....word. prince

['daughter', 'princess']

......i..... 0 ....word. daughter

......i..... 1 ....word. princess

['son', 'prince']

......i..... 0 ....word. son

......i..... 1 ....word. prince

['only', 'man', 'can', 'king']

......i..... 0 ....word. only

......i..... 1 ....word. man

......i..... 2 ....word. can

......i..... 3 ....word. king

['only', 'woman', 'can', 'queen']

......i..... 0 ....word. only

......i..... 1 ....word. woman

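As a sanity check on the windowing logic in the cell above, the (word, context) pairs for the first preprocessed sentence can be reproduced in isolation. This is a minimal standalone sketch of the same loop, using 3 context words on each side, which is what yields the 98 pairs counted by tqdm:

tokens = ['future', 'king', 'prince']
window = 3
pairs = []
for i, word in enumerate(tokens):
    for w in range(window):
        # Context ahead of the focus word
        if i + 1 + w < len(tokens):
            pairs.append([word, tokens[i + 1 + w]])
        # Context behind the focus word
        if i - w - 1 >= 0:
            pairs.append([word, tokens[i - w - 1]])

print(pairs)
# [['future', 'king'], ['future', 'prince'], ['king', 'prince'],
#  ['king', 'future'], ['prince', 'king'], ['prince', 'future']]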

In [69]:

embedding_dict

Out[69]:

{'beautiful': array([-1.3688022, -1.2648398], dtype=float32),
 'boy': array([-1.2045977, 1.3686044], dtype=float32),
 'can': array([-0.5418091 , 0.26029423], dtype=float32),
 'children': array([ 0.6832872, -1.0086254], dtype=float32),
 'daughter': array([-1.1178044, -1.1369839], dtype=float32),
 'family': array([ 0.9929922, -0.9225423], dtype=float32),
 'future': array([-0.43754777, 0.3495058 ], dtype=float32),
 'king': array([0.8600612 , 0.08342463], dtype=float32),
 'man': array([0.05348093, 1.1474305 ], dtype=float32),
 'now': array([-0.94651306, 0.7657763 ], dtype=float32),
 'only': array([-0.8729304 , 0.46015522], dtype=float32),
 'prince': array([0.6690954, 1.212434 ], dtype=float32),
 'princess': array([0.8468873, 0.5226464], dtype=float32),
 'queen': array([ 0.45890674, -0.7914091 ], dtype=float32),
 'realm': array([ 0.43352002, -0.9279783 ], dtype=float32),
 'royal': array([ 0.57931876, -1.0499104 ], dtype=float32),
 'rule': array([ 0.82716703, -0.5857244 ], dtype=float32),
 'son': array([-0.5071175, 1.6089387], dtype=float32),
 'strong': array([-1.0395274, 1.5985358], dtype=float32),
 'their': array([ 0.66103125, -0.9436734 ], dtype=float32),
 'woman': array([-0.34705597, -0.08890723], dtype=float32)}

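The find_similar helper defined at the top of the notebook is never called above; given the embedding_dict just shown, it ranks words by Euclidean distance in the 2-dimensional embedding space (a minimal sketch; the exact neighbours depend on the training run):

# Closest words to 'king' in the learned embedding space
print(find_similar('king', embedding_dict, top_n=3))
# For the vectors shown above this gives roughly: princess, rule, queen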

In [66]:

model.get_weights()[0]

Out[66]:

array([[-0.4522477 ],
       [-1.2182469 ],
       [-0.2559153 ],
       [ 1.1059917 ],
       [-0.18299527],
       [ 1.2638413 ],
       [-0.37547705],
       [ 0.23008995],
       [-0.5719701 ],
       [-0.95362383],
       [-0.46083274],
       [-0.5689876 ],
       [-0.63507533],
       [ 0.5916492 ],
       [ 1.2399746 ],
       [ 1.1080786 ],
       [ 0.90944767],
       [-1.4457749 ],
       [-1.5976491 ],
       [ 1.5208826 ],
       [-0.28549275]], dtype=float32)


In [52]:

X.shape[1]

Out[52]: 21


In [53]:

Y.shape[1]

Out[53]: 21


In [39]:

n_words

Out[39]: 21


In [38]:

unique_word_dict

Out[38]:

{'beautiful': 0,

'boy': 1,

'can': 2,

'children': 3,

'daughter': 4,

'family': 5,

'future': 6,

'king': 7,

'man': 8,

'now': 9,

'only': 10,

'prince': 11,

'princess': 12,

'queen': 13,

'realm': 14,

'royal': 15,

'rule': 16,

'son': 17,

'strong': 18,

'their': 19,

'woman': 20}


In [49]:

len(word_lists)

Out[49]: 98

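The 98 pairs can also be reproduced by counting: with 3 context words on each side, a preprocessed sentence of n tokens contributes, for each position i, min(3, n-1-i) forward pairs and min(3, i) backward pairs. A small sketch, using the sentence lengths after stop-word removal:

def n_pairs(n, window=3):
    # Pairs contributed by a sentence of n tokens
    return sum(min(window, n - 1 - i) + min(window, i) for i in range(n))

# Token counts of the twelve sentences after preprocessing
lengths = [3, 2, 2, 4, 4, 2, 4, 3, 3, 6, 4, 2]
print(sum(n_pairs(n) for n in lengths))   # 98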

In [35]:

[['future', 'king'], ['future', 'prince'], ['king', 'prince'], ['king'

['prince', 'king'], ['prince', 'future'], [ 'daughter', 'princess'],

['son', 'prince'], ['prince', 'son'], [ 'only', 'man'], ['only', 'can

['man', 'only'], ['man', 'king'], [ 'can', 'king'], ['can', 'man'], [

['king', 'only'], ['only', 'woman'], [ 'only', 'can'], ['only', 'quee

['woman', 'queen'], ['can', 'queen'], [ 'can', 'woman'], ['can', 'onl

['queen', 'only'], ['princess', 'queen'], [ 'queen', 'princess'], ['q

['queen', 'realm'], ['king', 'rule'], [ 'king', 'queen'], ['king', 'r

['rule', 'king'], ['rule', 'queen'], [ 'realm', 'rule'], ['realm', 'k

['prince', 'man'], ['strong', 'man'], [ 'strong', 'prince'], ['man',

['princess', 'beautiful'], ['princess', 'woman'], [ 'beautiful', 'wom

['woman', 'beautiful'], ['woman', 'princess'], [ 'royal', 'family'],

['family', 'king'], ['family', 'royal'], [ 'family', 'queen'], ['fami

['king', 'family'], ['king', 'their'], [ 'king', 'royal'], ['king', '

['queen', 'children'], ['queen', 'family'], [ 'queen', 'royal'], ['th

['their', 'king'], ['their', 'family'], [ 'children', 'their'], ['chi

['prince', 'only'], ['prince', 'boy'], [ 'prince', 'now'], ['only', '

['boy', 'now'], ['boy', 'only'], [ 'boy', 'prince'], ['now', 'boy'], [ 'boy', 'man'], ['man', 'boy'] ]


Out[35]:

['The future king is the prince',
 'Daughter is the princess ',
 'Son is the prince',
 'Only a man can be a king ',
 'Only a woman can be a queen',
 'The princess will be a queen',
 'Queen and king rule the realm',
 'The prince is a strong man',
 'The princess is a beautiful woman ',
 'The royal family is the king and queen and their children',
 'Prince is only a boy now',
 'A boy will be a man']

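The cell at the top of the notebook reads these sentences from input/sample.csv with a single text column. If that file is not at hand, an equivalent one can be recreated from the output above (a sketch under that assumption; the original file may contain additional columns):

import os
import pandas as pd

sentences = [
    'The future king is the prince',
    'Daughter is the princess',
    'Son is the prince',
    'Only a man can be a king',
    'Only a woman can be a queen',
    'The princess will be a queen',
    'Queen and king rule the realm',
    'The prince is a strong man',
    'The princess is a beautiful woman',
    'The royal family is the king and queen and their children',
    'Prince is only a boy now',
    'A boy will be a man',
]

os.makedirs('input', exist_ok=True)
pd.DataFrame({'text': sentences}).to_csv('input/sample.csv', index=False)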

In [21]:

!pip install nltk

Processing c:\users\win10\appdata\local\pip\cache\wheels\de\5e\42\64abaeca668161c3e2cecc24f864a8fc421e3d07a104fc8a51\nltk-3.5-py3-none-any.whl
Requirement already satisfied: regex in c:\users\win10\anaconda3\envs\nlp_projects\lib\site-packages (from nltk) (2020.10.28)
Requirement already satisfied: joblib in c:\users\win10\anaconda3\envs\nlp_projects\lib\site-packages (from nltk) (0.17.0)
Requirement already satisfied: click in c:\users\win10\anaconda3\envs\nlp_projects\lib\site-packages (from nltk) (7.1.2)
Requirement already satisfied: tqdm in c:\users\win10\anaconda3\envs\nlp_projects\lib\site-packages (from nltk) (4.43.0)
Installing collected packages: nltk
Successfully installed nltk-3.5


In [30]:

import gensim
from nltk.corpus import brown

# Training a word2vec model on the Brown corpus with gensim
model = gensim.models.Word2Vec(brown.sents())
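Once trained, the gensim model exposes dense vectors and similarity queries directly (a minimal sketch using the model from the cell above; it assumes the Brown corpus has already been fetched with nltk.download('brown'), and the nearest neighbours vary from run to run):

print(model.wv['king'][:5])                    # first few components of the 'king' vector
print(model.wv.most_similar('king', topn=5))   # nearest neighbours by cosine similarity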