Import Libraries
import nltk
from nltk.corpus import names
from sklearn.datasets import fetch_20newsgroups
import matplotlib.pyplot as plt
import numpy as np
Select articles of interest using a new class
groups = fetch_20newsgroups()


class newsgroup:
    """Small container holding a subset of posts and their target ids.

    ``data`` and ``target`` are parallel lists: ``target[k]`` is the
    newsgroup id of the post stored in ``data[k]``.
    """

    def __init__(self):
        # Per-instance lists; class-level lists would be shared by every
        # instance and silently accumulate across them.
        self.data = []
        self.target = []


# Target ids of the newsgroups to keep.
# NOTE(review): presumably these index into groups.target_names -- confirm
# which categories 8, 12 and 19 correspond to.
selected_targets = {8, 12, 19}

# NOTE(review): the loop body was reconstructed from a garbled export; it
# copies each matching post and its target id into `selected`.
selected = newsgroup()
for post, target_id in zip(groups.data, groups.target):
    if target_id in selected_targets:
        selected.data.append(post)
        selected.target.append(target_id)
Downloading 20news dataset. This may take a few minutes.
Downloading dataset from (14 MB)
Get 100 most frequent words and display the first 5
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words model restricted to the 100 most frequent non-stop-word terms.
cv = CountVectorizer(stop_words="english", max_features=100)
bag_of_words = cv.fit_transform(selected.data)

# Column sums give the total number of occurrences of each vocabulary term.
sum_words = bag_of_words.sum(axis=0)
words_freq = [(word, sum_words[0, idx]) for word, idx in cv.vocabulary_.items()]
words_freq = sorted(words_freq, key=lambda pair: pair[1], reverse=True)

print("100 most frequent in the selected articles and their frequency\n")
print('%-13s: %4s' % ('WORD', 'FREQ'))
# Only the five most frequent terms are displayed.
for word, count in words_freq[:5]:
    print('%-13s: %4g' % (word, count))
100 most frequent in the selected articles and their frequency
edu : 2349
com : 2083
subject : 1705
lines : 1647
organization : 1596
Create a function that keeps only purely alphabetic tokens (filters out numbers and punctuation)
# Predicate used to drop tokens that contain numbers or punctuation
def letters_only(astr: str) -> bool:
    """Return True if *astr* is non-empty and consists only of letters.

    The function does not modify the string; callers use the result to
    decide whether to keep a token.
    """
    return astr.isalpha()
List words that appear in most documents and may not help clustering
# Words that occur in most posts (mail-header terms and filler words) and
# therefore carry little signal for clustering.
# 'organization' is among the five most frequent terms printed earlier; the
# original list only had 'organizer', which is kept for backward compatibility.
unwanted_words = [
    'edu', 'subject', 'lines', 'organizer', 'organization', 'com', 'article',
    'like', 'just', 'know', 'mr', 'think', 'university', 'say',
]
Remove unwanted words and apply lemmatization (requires the NLTK 'wordnet' corpus)
from nltk.stem import WordNetLemmatizer

# The lemmatizer needs the WordNet corpus to be available locally.
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
# Set copy of the exclusion list so each membership test is O(1).
unwanted = set(unwanted_words)

# For every post keep only alphabetic tokens that are not in the exclusion
# list, lower-case and lemmatize them, and join them back into one string.
cleaned = [
    ' '.join(
        lemmatizer.lemmatize(word.lower())
        for word in post.split()
        if letters_only(word) and word.lower() not in unwanted
    )
    for post in selected.data
]

# Re-fit the vectorizer on the cleaned posts.
cleaned_bag_of_words = cv.fit_transform(cleaned)
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Unzipping corpora/
Perform K-Means clustering
from sklearn.cluster import KMeans

# Fit K-Means for k = 1..15 and record the within-cluster sum of squared
# distances (inertia) for each k.
sum_sq_dist = []
num_clusters = list(range(1, 16))
for n in num_clusters:
    km = KMeans(n_clusters=n)
    km.fit(cleaned_bag_of_words)
    sum_sq_dist.append(km.inertia_)

# Plot sum of squared distances against number of clusters
plt.plot(num_clusters, sum_sq_dist, '-+')
plt.title('The Elbow Method for Optimal k')
plt.xlabel(r'Number of Clusters, k')
plt.ylabel('Sum of Squared Distance')
Use the optimal k = 4 and print the top words of each topic in each cluster
from sklearn.decomposition import NMF

optimal_k = 4
km = KMeans(n_clusters=optimal_k)
# The model must be fitted before km.labels_ can be read below.
km.fit(cleaned_bag_of_words)

print("Displaying top 3 articles (topics) in each cluster after optimization")
for j in range(optimal_k):
    group_indices = np.where(km.labels_ == j)
    group_docs = [cleaned[i] for i in group_indices[0]]
    # Clusters with fewer than three posts are skipped.
    if len(group_indices[0]) > 2:
        # Re-fit the shared vectorizer on this cluster's posts only.
        group_words = cv.fit_transform(group_docs)
        # get_feature_names() was removed in scikit-learn 1.2; prefer the
        # replacement and fall back for older releases. Looked up once per
        # cluster instead of once per printed word.
        get_names = getattr(cv, "get_feature_names_out", None) or cv.get_feature_names
        feature_names = get_names()
        # Topic modelling: three topics per cluster
        nmf = NMF(n_components=3).fit(group_words)
        print("Cluster %g" % j)
        for topic_idx, topic in enumerate(nmf.components_):
            label = '{}: '.format(topic_idx)  # Display topic number in cluster
            # The slice [:-10:-1] selects the 9 highest-weighted words,
            # in descending order of weight.
            top_words = [feature_names[i] for i in topic.argsort()[:-10:-1]]
            print(label, " ".join(top_words))
Displaying top 3 articles (topics) in each cluster after optimization
Cluster 2
0: wa did people way christian got right jesus going
1: ha use want good doe need make power thing
2: bike dog ride new run motorcycle helmet apr dod
Cluster 3
0: juda wa act matthew greek word ha doe passage
1: jesus god christian bible child people ha good life
2: wa magi people ha gay new time zoroastrian did