1 year ago
#388295
Ameer_90
Scatter plotting text clustering results
I am trying to scatter plot the results of the clusters produced by the k-means clustering algorithm, but I'm getting an error. Should I use the embeddings instead, or is it an indexing problem? The clustering results were fine, and I want a way to visualize the clusters, so I tried the scatter plot method. The error message:
KeyError Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
3360 try:
-> 3361 return self._engine.get_loc(casted_key)
3362 except KeyError as err:
4 frames
/usr/local/lib/python3.7/dist-packages/pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
/usr/local/lib/python3.7/dist-packages/pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 15
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
<ipython-input-63-52bc7659b609> in <module>()
2
3 for i in u_labels:
----> 4 plt.scatter(df_clusters [cluster_labels[i] ] , df_clusters [cluster_labels[i] ] , label = i)
5 #plt.scatter(df_clusters[u_labels == i , 0] , df_clusters[u_labels == i , 1] , u_labels = i)
6 plt.legend()
/usr/local/lib/python3.7/dist-packages/pandas/core/frame.py in __getitem__(self, key)
3456 if self.columns.nlevels > 1:
3457 return self._getitem_multilevel(key)
-> 3458 indexer = self.columns.get_loc(key)
3459 if is_integer(indexer):
3460 indexer = [indexer]
/usr/local/lib/python3.7/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
3361 return self._engine.get_loc(casted_key)
3362 except KeyError as err:
-> 3363 raise KeyError(key) from err
3364
3365 if is_scalar(key) and isna(key) and not self.hasnans:
KeyError: 15
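If I read the traceback correctly, df_clusters[...] selects a column by name, and cluster_labels[i] is an integer cluster id (15 for one of the rows), so pandas looks for a column literally named 15 and fails. A tiny illustration with made-up data, just to show the indexing behaviour:

import pandas as pd

df = pd.DataFrame({"tokens": ["a b", "c d"], "cluster": [0, 15]})
print(df["cluster"])  # selecting an existing column by name works
print(df[15])         # raises KeyError: 15 -- there is no column named 15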
#Clustering our data using K-means and Word2Vec embeddings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
import gensim
from gensim import corpora
from google.colab import drive
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_samples, silhouette_score
import re
import string
drive.mount('/content/gdrive/',force_remount=True)
txt_reader=pd.read_csv('/content/gdrive/MyDrive/project_raw_data/preprocessing_results.txt',sep=";")#,header=None
nltk.download('punkt')
from nltk.tokenize import word_tokenize
tokenized_text=[]
for prescription in txt_reader:
tokenized=word_tokenize(prescription)
cleaned= [word for word in tokenized if len(word)>3]
tokenized_text.append(cleaned)
model = Word2Vec(sentences=tokenized_text, size=300, workers=64, seed=1)
# sentences is a list of lists containing the tokenized text.
# Error encountered: this gensim version uses size (not vector_size) for the length of the word vectors.
# workers is the number of worker threads used to parallelize training and make it faster.
# Fixing the seed keeps the results as consistent as possible across executions of the notebook.
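# (Side note, an assumption about gensim versions rather than part of the original run:
#  gensim 4.x renamed the size argument to vector_size, so a version-tolerant
#  initialization could look like this.)
try:
    model = Word2Vec(sentences=tokenized_text, vector_size=300, workers=64, seed=1)  # gensim >= 4.0
except TypeError:
    model = Word2Vec(sentences=tokenized_text, size=300, workers=64, seed=1)  # gensim 3.x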
#This function will vectorize the prescriptions and return features
def vectorize(list_of_docs, model): # edit
features = []
for tokens in list_of_docs:
zero_vector = np.zeros(model.vector_size)
vectors = []
for token in tokens:
if token in model.wv:
try:
vectors.append(model.wv[token])
except KeyError:
continue
if vectors:
vectors = np.asarray(vectors)
avg_vec = vectors.mean(axis=0)
features.append(avg_vec)
else:
features.append(zero_vector)
return features
vectorized_docs = vectorize(tokenized_text, model=model)
len(vectorized_docs)
len(vectorized_docs[0])
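# (Sanity-check sketch, not in the original notebook.)
# vectorized_docs is a plain Python list of 300-dimensional document vectors; stacking it
# into one NumPy matrix makes the shape explicit before clustering.
X = np.asarray(vectorized_docs)
print(X.shape)  # expected: (number_of_prescriptions, 300)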
#K-means Clustering using mini batches
def mbkmeans_clusters(
X, # Matrix of features
k, # Clusters number
mb, # mini batch size
print_silhouette_values, #per cluster
):
# Generate clusters and use MBKmeans to print Silhouette metrics
km = MiniBatchKMeans(n_clusters=k, batch_size=mb).fit(X)
print(f"For n_clusters = {k}")
print(f"Silhouette coefficient: {silhouette_score(X, km.labels_):0.2f}")
print(f"Inertia:{km.inertia_}")
if print_silhouette_values:
sample_silhouette_values = silhouette_samples(X, km.labels_)
print(f"Silhouette values:")
silhouette_values = []
for i in range(k):
cluster_silhouette_values = sample_silhouette_values[km.labels_ == i]
silhouette_values.append(
(
i,
cluster_silhouette_values.shape[0],
cluster_silhouette_values.mean(),
cluster_silhouette_values.min(),
cluster_silhouette_values.max(),
)
)
silhouette_values = sorted(
silhouette_values, key=lambda tup: tup[2], reverse=True
)
for s in silhouette_values:
print(
f" Cluster {s[0]}: Size:{s[1]} | Avg:{s[2]:.2f} | Min:{s[3]:.2f} | Max: {s[4]:.2f}"
)
return km, km.labels_ # Trained clustering model and labels based on X
clustering, cluster_labels = mbkmeans_clusters(
X=vectorized_docs,
k=17,
mb=1000,
print_silhouette_values=True,
)
df_clusters = pd.DataFrame({
"text": tokenized_text,
"tokens": [" ".join(text) for text in tokenized_text],
"cluster": cluster_labels
})
'''
For n_clusters = 20
Silhouette coefficient: 0.11
Inertia:973821.9586816286
'''
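# (Illustrative check, not part of the original run.)
# df_clusters only has "text", "tokens" and "cluster" columns -- no numeric coordinates --
# so the quickest summary of the clustering is the cluster sizes:
print(df_clusters["cluster"].value_counts())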
#Presenting the most relevant words in each cluster and examining them
print("Most representative terms in each cluster:")
for i in range(17): #50
tokens_per_cluster = ""
most_representative = model.wv.most_similar(positive=[clustering.cluster_centers_[i]], topn=20)
for t in most_representative:
tokens_per_cluster += f"{t[0]} "
print(f"Cluster {i}: {tokens_per_cluster}")
#trying to scatter plot results
#Getting unique labels
u_labels = np.unique(cluster_labels)
df_clusters
#plotting the results:
for i in u_labels:
    #plt.scatter(df_clusters[cluster_labels[i]], df_clusters[cluster_labels[i]], label=i)
    plt.scatter(df_clusters[u_labels == i, 0], df_clusters[u_labels == i, 1], u_labels=i)
plt.legend()
plt.show()
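For reference, one way I could imagine visualizing the clusters (a sketch only, reusing vectorized_docs and cluster_labels from above; the PCA projection is my assumption, not something from the original notebook): reduce the 300-dimensional document vectors to 2D and color the points by cluster label, instead of indexing df_clusters by cluster id.

from sklearn.decomposition import PCA

X = np.asarray(vectorized_docs)                 # (n_docs, 300) document vectors
coords = PCA(n_components=2).fit_transform(X)   # project to 2D for plotting

for i in np.unique(cluster_labels):
    mask = cluster_labels == i                  # documents assigned to cluster i
    plt.scatter(coords[mask, 0], coords[mask, 1], label=i, s=10)
plt.legend()
plt.show()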
python
pandas
dictionary
scatter-plot
word2vec
0 Answers