1 year ago
#388295
Ameer_90
Scatter plotting text clustering results
I am trying to scatter plot the results of the clusters produced by the k-means clustering algorithm, but I'm getting an error. Should I use the embeddings instead, or is it an indexing problem? The clustering results were fine, and I want a way to visualize the clusters, so I tried the scatter plot method. The error message:
KeyError Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
3360 try:
-> 3361 return self._engine.get_loc(casted_key)
3362 except KeyError as err:
4 frames
/usr/local/lib/python3.7/dist-packages/pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
/usr/local/lib/python3.7/dist-packages/pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 15
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
<ipython-input-63-52bc7659b609> in <module>()
2
3 for i in u_labels:
----> 4 plt.scatter(df_clusters [cluster_labels[i] ] , df_clusters [cluster_labels[i] ] , label = i)
5 #plt.scatter(df_clusters[u_labels == i , 0] , df_clusters[u_labels == i , 1] , u_labels = i)
6 plt.legend()
/usr/local/lib/python3.7/dist-packages/pandas/core/frame.py in __getitem__(self, key)
3456 if self.columns.nlevels > 1:
3457 return self._getitem_multilevel(key)
-> 3458 indexer = self.columns.get_loc(key)
3459 if is_integer(indexer):
3460 indexer = [indexer]
/usr/local/lib/python3.7/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
3361 return self._engine.get_loc(casted_key)
3362 except KeyError as err:
-> 3363 raise KeyError(key) from err
3364
3365 if is_scalar(key) and isna(key) and not self.hasnans:
KeyError: 15
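If I read the traceback correctly, df_clusters[...] selects a column by name, and cluster_labels[i] is an integer cluster id (15 for one of the rows), so pandas looks for a column literally named 15 and fails. A tiny illustration with made-up data, just to show the indexing behaviour:

import pandas as pd

df = pd.DataFrame({"tokens": ["a b", "c d"], "cluster": [0, 15]})
print(df["cluster"])  # selecting an existing column by name works
print(df[15])         # raises KeyError: 15 -- there is no column named 15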
#Clustering our data using K-means and Word2Vec embeddings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
import gensim
from gensim import corpora
from google.colab import drive
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_samples, silhouette_score
import re
import string
drive.mount('/content/gdrive/',force_remount=True)
txt_reader=pd.read_csv('/content/gdrive/MyDrive/project_raw_data/preprocessing_results.txt',sep=";")#,header=None
nltk.download('punkt')
from nltk.tokenize import word_tokenize
tokenized_text=[]
for prescription in txt_reader:
tokenized=word_tokenize(prescription)
cleaned= [word for word in tokenized if len(word)>3]
tokenized_text.append(cleaned)
model = Word2Vec(sentences=tokenized_text, size=300, workers=64, seed=1)
# sentences is a list of lists containing the tokenized text.
# Error encountered: this gensim version uses size (not vector_size) for the length of the word vectors.
# workers is the number of worker threads used to parallelize training and make it faster.
# Fixing the seed keeps the results as consistent as possible across executions of the notebook.
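# (Side note, an assumption about gensim versions rather than part of the original run:
#  gensim 4.x renamed the size argument to vector_size, so a version-tolerant
#  initialization could look like this.)
try:
    model = Word2Vec(sentences=tokenized_text, vector_size=300, workers=64, seed=1)  # gensim >= 4.0
except TypeError:
    model = Word2Vec(sentences=tokenized_text, size=300, workers=64, seed=1)  # gensim 3.x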
#This function will vectorize the prescriptions and return features
def vectorize(list_of_docs, model): # edit
features = []
for tokens in list_of_docs:
zero_vector = np.zeros(model.vector_size)
vectors = []
for token in tokens:
if token in model.wv:
try:
vectors.append(model.wv[token])
except KeyError:
continue
if vectors:
vectors = np.asarray(vectors)
avg_vec = vectors.mean(axis=0)
features.append(avg_vec)
else:
features.append(zero_vector)
return features
vectorized_docs = vectorize(tokenized_text, model=model)
len(vectorized_docs)
len(vectorized_docs[0])
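# (Sanity-check sketch, not in the original notebook.)
# vectorized_docs is a plain Python list of 300-dimensional document vectors; stacking it
# into one NumPy matrix makes the shape explicit before clustering.
X = np.asarray(vectorized_docs)
print(X.shape)  # expected: (number_of_prescriptions, 300)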
#K-means Clustering using mini batches
def mbkmeans_clusters(
X, # Matrix of features
k, # Clusters number
mb, # mini batch size
print_silhouette_values, #per cluster
):
# Generate clusters and use MBKmeans to print Silhouette metrics
km = MiniBatchKMeans(n_clusters=k, batch_size=mb).fit(X)
print(f"For n_clusters = {k}")
print(f"Silhouette coefficient: {silhouette_score(X, km.labels_):0.2f}")
print(f"Inertia:{km.inertia_}")
if print_silhouette_values:
sample_silhouette_values = silhouette_samples(X, km.labels_)
print(f"Silhouette values:")
silhouette_values = []
for i in range(k):
cluster_silhouette_values = sample_silhouette_values[km.labels_ == i]
silhouette_values.append(
(
i,
cluster_silhouette_values.shape[0],
cluster_silhouette_values.mean(),
cluster_silhouette_values.min(),
cluster_silhouette_values.max(),
)
)
silhouette_values = sorted(
silhouette_values, key=lambda tup: tup[2], reverse=True
)
for s in silhouette_values:
print(
f" Cluster {s[0]}: Size:{s[1]} | Avg:{s[2]:.2f} | Min:{s[3]:.2f} | Max: {s[4]:.2f}"
)
return km, km.labels_ # Trained clustering model and labels based on X
clustering, cluster_labels = mbkmeans_clusters(
X=vectorized_docs,
k=17,
mb=1000,
print_silhouette_values=True,
)
df_clusters = pd.DataFrame({
"text": tokenized_text,
"tokens": [" ".join(text) for text in tokenized_text],
"cluster": cluster_labels
})
'''
For n_clusters = 20
Silhouette coefficient: 0.11
Inertia:973821.9586816286
'''
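# (Illustrative check, not part of the original run.)
# df_clusters only has "text", "tokens" and "cluster" columns -- no numeric coordinates --
# so the quickest summary of the clustering is the cluster sizes:
print(df_clusters["cluster"].value_counts())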
#Presenting the most relevant words in each cluster and examining them
print("Most representative terms in each cluster:")
for i in range(17): #50
tokens_per_cluster = ""
most_representative = model.wv.most_similar(positive=[clustering.cluster_centers_[i]], topn=20)
for t in most_representative:
tokens_per_cluster += f"{t[0]} "
print(f"Cluster {i}: {tokens_per_cluster}")
#trying to scatter plot results
#Getting unique labels
u_labels = np.unique(cluster_labels)
df_clusters
#plotting the results:
for i in u_labels:
    #plt.scatter(df_clusters[cluster_labels[i]], df_clusters[cluster_labels[i]], label=i)
    plt.scatter(df_clusters[u_labels == i, 0], df_clusters[u_labels == i, 1], u_labels=i)
plt.legend()
plt.show()
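For reference, one way I could imagine visualizing the clusters (a sketch only, reusing vectorized_docs and cluster_labels from above; the PCA projection is my assumption, not something from the original notebook): reduce the 300-dimensional document vectors to 2D and color the points by cluster label, instead of indexing df_clusters by cluster id.

from sklearn.decomposition import PCA

X = np.asarray(vectorized_docs)                 # (n_docs, 300) document vectors
coords = PCA(n_components=2).fit_transform(X)   # project to 2D for plotting

for i in np.unique(cluster_labels):
    mask = cluster_labels == i                  # documents assigned to cluster i
    plt.scatter(coords[mask, 0], coords[mask, 1], label=i, s=10)
plt.legend()
plt.show()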
python
pandas
dictionary
scatter-plot
word2vec
0 Answers