Import Libraries
import nltk
from nltk.corpus import names
from sklearn.datasets import fetch_20newsgroups
import matplotlib.pyplot as plt
import numpy as np
Select articles of interest using a new class
groups = fetch_20newsgroups()

# Container for the selected subset of articles
class newsgroup:
    data = []
    target = []

selected = newsgroup()
for i in range(len(groups.data)):
    if groups.target[i] == 8 or groups.target[i] == 12 or groups.target[i] == 19:
        selected.data.append(groups.data[i])
        selected.target.append(groups.target[i])
Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)
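The three target IDs can be checked against the dataset's category names via the target_names attribute; a quick sanity check (a minimal sketch, relying only on attributes returned by fetch_20newsgroups):
# Which newsgroups do the selected target IDs correspond to?
for t in (8, 12, 19):
    print(t, groups.target_names[t])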
Get 100 most frequent words and display the first 5
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words="english", max_features=100)
bag_of_words = cv.fit_transform(selected.data)
sum_words = bag_of_words.sum(axis=0)
words_freq = [(word, sum_words[0, idx]) for word, idx in cv.vocabulary_.items()]
words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)

print("100 most frequent words in the selected articles and their frequency\n")
print('%-13s: %4s' % ('WORD', 'FREQ'))
print('___________________')

# Display only the first 5 of the 100 most frequent words
i = 0
for word, count in words_freq:
    i = i + 1
    print('%-13s: %4g' % (word, count))
    if i == 5:
        break
100 most frequent words in the selected articles and their frequency
WORD : FREQ
___________________
edu : 2349
com : 2083
subject : 1705
lines : 1647
organization : 1596
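The same counts can also be shown graphically; a minimal sketch of a bar chart for the top 10 words, reusing the matplotlib import above:
# Plot the 10 most frequent words as a horizontal bar chart
top_words, top_counts = zip(*words_freq[:10])
plt.figure(figsize=(6, 4))
plt.barh(list(top_words)[::-1], list(top_counts)[::-1])
plt.xlabel('Frequency')
plt.title('10 Most Frequent Words')
plt.show()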
Create a function to filter out numbers and punctuation
# Keeps only purely alphabetic tokens (drops numbers and punctuation)
def letters_only(astr):
    return astr.isalpha()
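A quick check of the helper on a few illustrative tokens:
print(letters_only('bike'))   # True  - purely alphabetic
print(letters_only('1993'))   # False - digits
print(letters_only('edu,'))   # False - trailing punctuation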
List words that appear in most documents and may not help clustering
unwanted_words = ['edu', 'subject', 'lines', 'organization', 'com', 'article', 'like', 'just', 'know', 'mr', 'think', 'university', 'say']
Remove unwanted words and apply lemmatization
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

cleaned = []
lemmatizer = WordNetLemmatizer()
for post in selected.data:
    cleaned.append(' '.join(
        lemmatizer.lemmatize(word.lower())
        for word in post.split()
        if letters_only(word) and word.lower() not in unwanted_words))
cleaned_bag_of_words = cv.fit_transform(cleaned)
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Unzipping corpora/wordnet.zip.
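For reference, the WordNet lemmatizer reduces inflected forms to a base form (nouns by default). This is also why tokens such as wa, ha and doe appear in the topic output further down: was, has and does get noun-lemmatized and then slip past the English stop-word list. A small illustration:
print(lemmatizer.lemmatize('churches'))   # church
print(lemmatizer.lemmatize('bikes'))      # bike
print(lemmatizer.lemmatize('was'))        # wa (noun-lemmatization quirk)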
Perform K-Means clustering
from sklearn.cluster import KMeans

sum_sq_dist = []
num_clusters = list(range(1, 16))
for n in num_clusters:
    km = KMeans(n_clusters=n)
    km.fit(cleaned_bag_of_words)
    sum_sq_dist.append(km.inertia_)

# Plot sum of squared distances against number of clusters
plt.figure(figsize=(6, 4))
plt.plot(num_clusters, sum_sq_dist, '-+')
plt.title('The Elbow Method for Optimal k')
plt.xlabel(r'Number of Clusters, k')
plt.ylabel('Sum of Squared Distance')
plt.show()
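The elbow can be hard to read off by eye; as a cross-check, a silhouette-score sweep over the same range is one option (a minimal sketch using scikit-learn's silhouette_score, which requires at least two clusters):
from sklearn.metrics import silhouette_score

# Higher silhouette score indicates better-separated clusters
for n in range(2, 16):
    labels = KMeans(n_clusters=n).fit_predict(cleaned_bag_of_words)
    print(n, silhouette_score(cleaned_bag_of_words, labels))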
Use the optimal k = 4 and display the top words of the top topics in each cluster
from sklearn.decomposition import NMF

optimal_k = 4
km = KMeans(n_clusters=optimal_k)
km.fit(cleaned_bag_of_words)

print("Displaying top 3 topics in each cluster after optimization")
print("_____________________________________________________________________")
for j in range(optimal_k):
    group_indices = np.where(km.labels_ == j)
    group_docs = [cleaned[i] for i in group_indices[0]]
    if len(group_indices[0]) > 2:
        group_words = cv.fit_transform(group_docs)
        # Topic modelling: only the top 3 topics per cluster are needed
        nmf = NMF(n_components=3).fit(group_words)
        print("Cluster %g" % j)
        for topic_idx, topic in enumerate(nmf.components_):
            label = '{}: '.format(topic_idx)  # topic number within the cluster
            # Show the highest-weighted words in this topic
            print(label, " ".join([cv.get_feature_names()[i] for i in topic.argsort()[:-10:-1]]))
Displaying top 3 topics in each cluster after optimization
_____________________________________________________________________
Cluster 2
0: wa did people way christian got right jesus going
1: ha use want good doe need make power thing
2: bike dog ride new run motorcycle helmet apr dod
Cluster 3
0: juda wa act matthew greek word ha doe passage
1: jesus god christian bible child people ha good life
2: wa magi people ha gay new time zoroastrian did
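To label a new post with one of the four clusters, the vectorizer and KMeans model have to be refit on the full cleaned corpus first, since cv was refit per cluster in the loop above. A minimal sketch with a made-up example post:
# Re-fit the vectorizer and KMeans on the full cleaned corpus
cv_full = CountVectorizer(stop_words="english", max_features=100)
X_full = cv_full.fit_transform(cleaned)
km_full = KMeans(n_clusters=optimal_k).fit(X_full)

# Hypothetical new post, preprocessed the same way as the training data
new_post = "I bought a new bike and a helmet for the long ride"
new_clean = ' '.join(lemmatizer.lemmatize(w.lower()) for w in new_post.split()
                     if letters_only(w) and w.lower() not in unwanted_words)
print("Predicted cluster:", km_full.predict(cv_full.transform([new_clean]))[0])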