The goal of this project is to investigate the use of Spark machine learning algorithms to perform clustering and topic modelling of a newly gathered dataset containing speeches by Members of Parliament in the UK House of Commons over the past year. The aim is to determine the best approach for modelling this data and how best to harness the distributed computing power of Spark to do so efficiently. Text data of this kind is very high-dimensional, so the algorithms can become very computationally expensive without efficient systems. I will also discuss validating the topic model and visualizing the results.
Apache Spark is a unified analytics engine for large-scale distributed data processing. It was designed by researchers at UC Berkeley beginning in 2009 in an effort to improve on the existing MapReduce technology, aiming to make it both more efficient for interactive and iterative computing jobs and less difficult to learn. Spark provides in-memory storage for intermediate computations, making it much faster than Hadoop MapReduce. It incorporates libraries with composable APIs for machine learning (MLlib), interactive SQL queries (Spark SQL), stream processing of real-time data (Structured Streaming), and graph processing (GraphX). In this paper I will make use of Resilient Distributed Datasets (described below), Spark DataFrames, and the Spark SQL API, and I will rely on MLlib's implementation of machine learning algorithms to build a Latent Dirichlet Allocation (LDA) topic model.
An RDD is an immutable distributed collection of data elements, partitioned across the nodes in a cluster, that can be operated on in parallel with a low-level API offering transformations and actions. RDDs are extremely useful in many of the processing steps of this paper, and Spark makes it very easy to move data between RDDs and DataFrames with simple API method calls, depending on which format better suits your needs.
In general, I use RDDs in the processing steps when I need to perform iterative operations on the data in a distributed way and do not need the structure of a DataFrame. I then transition to DataFrames, which allow me to impose structure onto the data, including useful functionality like column names.
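As a minimal sketch of this back-and-forth (the example data and column names below are illustrative, not the actual speeches schema), one can drop from a DataFrame to its underlying RDD for an element-wise transformation and then re-impose a schema:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('rdd-df-sketch').getOrCreate()
# start from a structured DataFrame with a single text column (illustrative data)
df = spark.createDataFrame([("a first document",), ("a second document",)], ["text"])
# drop down to the RDD API for an element-wise transformation
rdd = df.rdd.map(lambda row: (row.text, len(row.text.split())))
# re-impose structure (column names) once it becomes useful again
df_with_counts = spark.createDataFrame(rdd, ["text", "n_tokens"])
df_with_counts.show()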
Topic models offer an automated procedure for discovering the main "themes" in an unstructured corpus of texts. This is a form of unsupervised machine learning because the topic model requires no labelled data. The model determines themes from a set of documents using a probabilistic model based on word frequencies and creates document clusters based on the breakdown of the themes within the documents.
The topic model I will be using is Latent Dirichlet Allocation (LDA). This is a "mixture model", meaning that documents can contain multiple topics and words can belong to multiple topics.
From: Probabilistic topic models, David Blei, 2012.
The Latent Dirichlet Allocation topic model posits a statistical model that is assumed to have generated the observed corpus and then estimates that model in order to recover latent topics under the assumptions made. Each document is assumed to contain weights of topics, and each topic is assumed to contain weights of words.
Latent Dirichlet Allocation makes use of a probability distribution called the Dirichlet distribution.
From: Wikipedia.
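For reference, the Dirichlet distribution is a distribution over probability vectors: for concentration parameters $\alpha_1,\dots,\alpha_K > 0$, its density on the $K$-simplex is

$$p(\theta_1,\dots,\theta_K \mid \alpha_1,\dots,\alpha_K) = \frac{\Gamma\!\left(\sum_{k=1}^{K}\alpha_k\right)}{\prod_{k=1}^{K}\Gamma(\alpha_k)} \prod_{k=1}^{K}\theta_k^{\alpha_k-1}, \qquad \theta_k \ge 0, \quad \sum_{k=1}^{K}\theta_k = 1.$$

A draw from a Dirichlet distribution is therefore itself a probability vector, which is exactly the object needed to represent a document's topic shares or a topic's word shares.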
The LDA model assumes that the documents have been produced by a generative process. For each document, topic shares are drawn from a Dirichlet distribution to determine that document's topic distribution. Then, for each topic, word shares are drawn from a second Dirichlet distribution. Finally, the model fills in each document with words by drawing a topic from the document's multinomial topic distribution and then drawing a word from that topic's multinomial word distribution. Of course, this is not how Parliamentary speeches (or any documents written by humans) are actually written, but assuming that this is how they are generated allows us to back out the shares of topics within documents and the shares of words within topics.
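To make this generative story concrete, the short sketch below simulates it with NumPy for a toy example; the vocabulary, the number of topics, and the hyperparameters alpha and eta are all made up for illustration and have nothing to do with the model fitted later in this paper.
import numpy as np
rng = np.random.default_rng(0)
vocab = ["covid", "vaccine", "trade", "border", "school", "tax"]   # toy vocabulary (illustrative)
K, V, doc_length = 2, len(vocab), 15    # topics, vocabulary size, words per document
alpha = np.full(K, 0.5)   # Dirichlet prior over a document's topic shares (illustrative value)
eta = np.full(V, 0.1)     # Dirichlet prior over a topic's word shares (illustrative value)
# one word distribution per topic, drawn from Dirichlet(eta)
topic_word = rng.dirichlet(eta, size=K)          # shape (K, V)
# for a single document: draw its topic shares, then fill it with words
doc_topic = rng.dirichlet(alpha)                 # shape (K,)
words = []
for _ in range(doc_length):
    z = rng.choice(K, p=doc_topic)               # choose a topic for this word slot
    w = rng.choice(V, p=topic_word[z])           # choose a word from that topic's distribution
    words.append(vocab[w])
print(doc_topic)
print(" ".join(words))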
These successive draws can also be visualized using plate notation as in Blei, 2012. In the diagram below, $\alpha$ is the parameter of the Dirichlet prior on the per-document topic distributions and $\eta$ is the parameter of the Dirichlet prior on the per-topic word distributions.
From: Probabilistic topic models, David Blei, 2012.
Estimating the topic model is done in a Bayesian framework. The two Dirichlet distributions from which the topic shares and word shares are drawn are the prior distributions. Using Bayes' rule and our actual text data, we update the priors to obtain posterior distributions over topics and words. Once we have these distributions, we can form clusters by grouping documents together based on the similarity of their topic distributions. These clusters are the objective of this kind of model and of this paper.
The website theyworkforyou.com keeps a record of all Parliamentary speeches going back many years in what are called Hansard Reports. The website allows you to retrieve the data as raw XML files using rsync. To download all speeches from 2020-2021 I used the following command.
rsync -az --progress --exclude '.svn' --exclude 'tmp/' --relative data.theyworkforyou.com::parldata/scrapedxml/debates/debates202* .
These files require a fair amount of processing before they are in a usable format. I found a very useful project that maintains R scripts to clean and process these XML files. The code is all available on GitHub.
The R scripts, which I have modified for the specifics of this data, are saved in this repository. There are four important files. "01_process_data.R" does the initial processing, transferring the data from XML format to an R dataframe and outputting an RDS data file. "02_member_matching.R" matches the names of the MPs using their member ids. "03_combining.R" combines the intermediate data files into one, and "04_final_cleaning.R" does some final formatting of the dataset before outputting the final RDS or csv file. Please have a look at these R scripts for more information on this step.
In this paper I make use of a few different computing setups. First, I installed PySpark on my local machine so that I could test functionality locally before moving it to a Google Cloud Platform (GCP) cluster. This proved helpful because otherwise it is easy to use up GCP credit very quickly before the code is working properly. I tested the code on my local machine and then moved it to GCP clusters in order to speed up the processing and avoid relying on my own computing power. Because this is high-dimensional text data, it was infeasible to run all of the modelling locally.
To make use of a GCP cluster I followed several steps. First, I created a bucket and uploaded the data to it. This was time consuming because of the size of the data.
gsutil cp C:\<path_to_file> gs://<bucket_name>
Second, I created a Dataproc cluster. See the documentation on Dataproc clusters here. I created a few different clusters in order to experiment with the optimal number of worker nodes. In the example below I created a cluster with 2 worker nodes. See the Computational Efficiency section below for more information on experimenting with different numbers of worker nodes.
gcloud beta dataproc clusters create capstone-cluster --project covid-political-speech \
--bucket maxdk-capstone-bucket --region europe-west2 \
--num-workers 2 \
--image-version=1.4-debian10 \
--optional-components=ANACONDA,JUPYTER \
--enable-component-gateway \
--initialization-actions \
gs://goog-dataproc-initialization-actions-europe-west2/python/pip-install.sh,gs://maxdk-capstone-bucket/my-actions.sh \
--metadata 'PIP_PACKAGES=sklearn nltk pandas numpy'
Third, using the cluster I opened a console window and downloaded the data from the bucket.
gsutil cp gs://<bucket_name>/<path_to_file> .
Fourth, I put the datafile into the Hadoop Distributed File System (HDFS) in order to make it easily accessible from a Jupyter notebook.
hadoop fs -put ./covid_speeches.csv /covid_speeches.csv
Finally, I created a Jupyter notebook and began the analysis you'll see below.
Creating a topic model requires constructing a term-frequency matrix from the text data. This process reduces each document to a row in a matrix whose columns are terms (words); each cell holds the frequency with which that term occurs in the document. In this case, each document is a Parliamentary speech.
The first step is to generate tokens from the text. This involves breaking the text up into lists of features, which are usually single words but can also be bi-grams (two words) or tri-grams (three words). In this step I also process and standardize the text data in a few ways. For instance, I convert the features to lower case and remove punctuation. I also use a process called lemmatization to reduce words to their roots. In this stage I convert the original speeches dataframe into an RDD to help with efficient processing.
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Opps').getOrCreate()
#import nltk
#nltk.download('all')
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
stop_words = set(stopwords.words('english'))
table = str.maketrans('', '', string.punctuation)
lmtzr = WordNetLemmatizer() # see https://www.nltk.org/_modules/nltk/stem/wordnet.html for details
# The lemmatizer incorrectly deals with the below words.
# It trims the last "s" off so I have added them to a list of tokens to not lemmatize
no_lemmatize = ["nhs", "wales"]
# this line gets the data from hdfs. It is possible because of the 4th step in the computing setup above.
speeches_path = "hdfs:///covid_speeches.csv"
speeches_df = spark.read.csv(speeches_path, header='true', sep=',')
def get_tokens(line):
    # nltk must be importable on each worker node
    import nltk
    tokens = word_tokenize(line)
    # convert to lower case
    tokens = [w.lower() for w in tokens]
    # remove punctuation from each word
    stripped = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    words = [word for word in stripped if word.isalpha()]
    # filter out stop words
    words = [w for w in words if w not in stop_words]
    # lemmatize the words, leaving the no_lemmatize exceptions unchanged
    words = [lmtzr.lemmatize(w) if w not in no_lemmatize else w for w in words]
    return words
# create an rdd and apply the get_tokens function to each document
speeches_rdd = speeches_df.rdd.map(tuple)
speeches_rdd = speeches_rdd.map(lambda line: (1, get_tokens(line[0])))
from pyspark.sql.functions import monotonically_increasing_id, row_number
from pyspark.sql.window import Window
# create a spark dataframe and add a row_index column
speeches_df=speeches_df.withColumn('row_index', row_number().over(Window.orderBy(monotonically_increasing_id())))
# these are the variables we have access to aside from the text data
speeches_df.show(3)
+--------------------+---------+----------+---------+----------+------------+-----------------+---------+
|              speech|person_id|first_name|last_name|      date|       Party|     Constituency|row_index|
+--------------------+---------+----------+---------+----------+------------+-----------------+---------+
|On behalf of the ...|    10295|   Lindsay|    Hoyle|2020-01-07|     Speaker|          Chorley|        1|
|Whether he plans ...|    25886|      Mark|    Logan|2020-01-07|Conservative|Bolton North East|        2|
|First, Mr Speaker...|    24854|     Sajid|    Javid|2020-01-07|Conservative|       Bromsgrove|        3|
+--------------------+---------+----------+---------+----------+------------+-----------------+---------+
only showing top 3 rows
We have already removed a set of generic English-language stopwords within the get_tokens function. However, over several runs I found it helpful to add some corpus-specific stopwords to that generic list. For example, "hon" appears very frequently in these Parliamentary speeches because MPs refer to one another as "the right hon gentleman/lady". This word is not in the standard stopword list, and as a formality it does not add any particularly important meaning to a speech, so it is worth removing. We can easily remove it by excluding words that appear very frequently. In the code below I filter out words that appear more than 10,000 times across the corpus. This removes "hon" and a small number of other words that are used frequently in Parliamentary speeches but are not considered stopwords in everyday English.
Additionally, the code below removes words that appear extremely infrequently: fewer than twice in the entire corpus. This is because the data contains some typos. There are many examples of two words concatenated with no space in between, which is a somewhat hairy problem to address directly, so I have opted simply to remove these typos. Removing very rare words is a common step in text analysis, as such words are unlikely to matter to the probabilistic model. Below we see some of the words that have been removed.
# additionally filter out words that occur very frequently or very infrequently
doc_stop_words = speeches_rdd.flatMap(lambda r: r[1]).map(lambda r: (r,1)).reduceByKey(lambda a,b: a+b)
doc_stop_words = doc_stop_words.filter(lambda a: a[1]<2 or a[1]>10000).map(lambda r: r[0]).collect()
# throw away stop words and words that are just single letters.
speeches_rdd = speeches_rdd.map(lambda r: (r[0], [w for w in r[1] if not w in doc_stop_words and not len(w)==1]))
# view some of the stopwords we've removed
# many of them are typos that would not contribute to the model
doc_stop_words[50:59]
['lawmost', 'accordinglythat', 'yearsnew', 'keepwhen', 'irelandon', 'northwestturning', 'banni', 'antiunion', 'inappropriatethe']
from pyspark.ml.feature import CountVectorizer
from pyspark.sql.functions import monotonically_increasing_id
# create a dataframe for modelling
tokens_df = spark.createDataFrame(speeches_rdd, ["dummy","words"])
tokens_df.cache()
tokens_df.show(5)
+-----+--------------------+
|dummy|               words|
+-----+--------------------+
|    1|[behalf, whole, w...|
|    1|[whether, plan, i...|
|    1|[mr, speaker, ass...|
|    1|[election, campai...|
|    1|[remember, visit,...|
+-----+--------------------+
only showing top 5 rows
# this is one of the steps I timed in order to experiment with different numbers of GCP nodes
#import timeit
#start_time = timeit.default_timer()
# create term-frequency matrix
cv = CountVectorizer(inputCol="words", outputCol="features", minDF=2)
cv_model = cv.fit(tokens_df)
speeches_df_w_features = cv_model.transform(tokens_df)
speeches_df_w_features.cache()
speeches_df_w_features.show(5)
#elapsed = timeit.default_timer() - start_time
#print(elapsed)
+-----+--------------------+--------------------+
|dummy|               words|            features|
+-----+--------------------+--------------------+
|    1|[behalf, whole, w...|(40170,[14,17,22,...|
|    1|[whether, plan, i...|(40170,[46,53,55,...|
|    1|[mr, speaker, ass...|(40170,[29,37,55,...|
|    1|[election, campai...|(40170,[23,55,73,...|
|    1|[remember, visit,...|(40170,[8,23,37,4...|
+-----+--------------------+--------------------+
only showing top 5 rows
print ("Vocabulary from CountVectorizerModel is:")
print(cv_model.vocabulary[:100])
print("\n---\n")
W = len(cv_model.vocabulary)
print("Number of terms W = ", W)
Vocabulary from CountVectorizerModel is:
['like', 'thank', 'look', 'going', 'question', 'national', 'sector', 'system', 'number', 'working', 'sure', 'thing', 'course', 'area', 'day', 'prime', 'gentleman', 'world', 'measure', 'change', 'hope', 'deal', 'life', 'agree', 'welcome', 'trade', 'committee', 'give', 'clear', 'opportunity', 'able', 'family', 'council', 'future', 'including', 'forward', 'continue', 'every', 'done', 'constituent', 'pay', 'already', 'social', 'amendment', 'pandemic', 'month', 'plan', 'northern', 'even', 'end', 'given', 'great', 'ireland', 'set', 'whether', 'funding', 'absolutely', 'eu', 'economy', 'still', 'united', 'department', 'two', 'impact', 'scotland', 'law', 'review', 'parliament', 'power', 'report', 'matter', 'authority', 'billion', 'chancellor', 'international', 'decision', 'act', 'british', 'woman', 'mean', 'action', 'concern', 'money', 'crisis', 'order', 'policy', 'worker', 'however', 'taking', 'industry', 'risk', 'around', 'provide', 'lady', 'making', 'next', 'step', 'staff', 'agreement', 'fact']

---

Number of terms W =  40170
from pyspark.ml.clustering import LDA
# this is the second step I timed
#start_time2 = timeit.default_timer()
# set parameters of LDA model and fit data
lda = LDA(k=8, maxIter=7, optimizer='online', seed=4)
lda_model = lda.fit(speeches_df_w_features)
# two ways of measuring how well the LDA model fits the data are log likelihood and perplexity
# I used these two metrics to experiment with different model parameters
ll = lda_model.logLikelihood(speeches_df_w_features)
lp = lda_model.logPerplexity(speeches_df_w_features)
print("The lower bound on the log likelihood of the entire corpus: " + str(ll))
print("The upper bound on the perplexity: " + str(lp))
#elapsed_2 = timeit.default_timer() - start_time2
#print(elapsed_2)
The lower bound on the log likelihood of the entire corpus: -38825971.032892466
The upper bound on the perplexity: 8.419027683085684
# Describe topics
topics = lda_model.describeTopics(20)
print("The topics described by their top-weighted terms:")
topics.show()
# Below I print out the top words in each topic
import numpy as np
topic_i = topics.select("termIndices").rdd.map(lambda r: r[0]).collect()
for i in topic_i:
print(np.array(cv_model.vocabulary)[i])
The topics described by their top-weighted terms:
+-----+--------------------+--------------------+
|topic|         termIndices|         termWeights|
+-----+--------------------+--------------------+
|    0|[78, 15, 18, 37, ...|[0.00401786159253...|
|    1|[6, 32, 55, 42, 1...|[0.00355062138291...|
|    2|[64, 11, 17, 16, ...|[0.00254543662537...|
|    3|[17, 6, 16, 36, 2...|[9.87347133558917...|
|    4|[103, 84, 95, 16,...|[0.00274249636086...|
|    5|[43, 25, 57, 119,...|[0.00553972347817...|
|    6|[11, 1, 6, 0, 41,...|[9.21051324176059...|
|    7|[7, 160, 8, 103, ...|[9.23446147101310...|
+-----+--------------------+--------------------+

['woman' 'prime' 'measure' 'every' 'national' 'decision' 'lockdown' 'like' 'day' 'thank' 'virus' 'family' 'restriction' 'system' 'life' 'international' 'month' 'going' 'welcome' 'number']
['sector' 'council' 'funding' 'social' 'tax' 'economy' 'billion' 'family' 'system' 'chancellor' 'pay' 'area' 'crisis' 'fund' 'welcome' 'plan' 'building' 'measure' 'department' 'pandemic']
['scotland' 'thing' 'world' 'gentleman' 'life' 'national' 'prime' 'like' 'thank' 'course' 'day' 'number' 'northern' 'working' 'look' 'clear' 'police' 'question' 'united' 'going']
['world' 'sector' 'gentleman' 'continue' 'opportunity' 'england' 'course' 'united' 'financial' 'act' 'regulation' 'including' 'change' 'census' 'funding' 'around' 'ireland' 'however' 'department' 'report']
['police' 'order' 'next' 'gentleman' 'crime' 'safe' 'council' 'officer' 'suspended' 'participating' 'thank' 'day' 'motion' 'force' 'system' 'amendment' 'three' 'tax' 'able' 'change']
['amendment' 'trade' 'eu' 'clause' 'committee' 'agreement' 'act' 'deal' 'law' 'regulation' 'ireland' 'parliament' 'northern' 'question' 'end' 'provision' 'european' 'order' 'market' 'food']
['thing' 'thank' 'sector' 'like' 'already' 'clear' 'gentleman' 'amendment' 'able' 'hope' 'forward' 'national' 'measure' 'life' 'including' 'prime' 'tax' 'future' 'vaccine' 'going']
['system' 'vaccine' 'number' 'police' 'change' 'lady' 'possible' 'trade' 'hope' 'deal' 'going' 'month' 'welcome' 'opportunity' 'sure' 'law' 'like' 'plan' 'absolutely' 'making']
An important step of unsupervised topic modelling is interpreting the topics and making sure they coincide with a human understanding of coherent themes. The topics are only useful insofar as they describe actual topics humans would recognize. I experimented with different numbers of topics, different maximum numbers of iterations, and the two available algorithm optimizers, and ultimately found these topics to be the most descriptive of the data.
Of the topics above, topics 0 and 7 appear to be largely about COVID-19, restrictions, vaccines, and so on. In topic 1, many of the terms relate to the economy and perhaps a spending bill. Topic 2 has to do with Scotland and northern England, among other things. Topic 4 appears to be largely about policing, law and order, and crime. Topic 5 appears to be about Brexit, including terms related to Ireland, Northern Ireland, and the new border.
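As an illustration of the model-selection loop behind that experimentation, the sketch below refits the model for a handful of candidate values of k and compares held-out log likelihood and perplexity; the candidate values and split ratio here are illustrative assumptions rather than a record of my exact runs.
# illustrative model-selection loop; the candidate values of k and the split are assumptions
train_df, heldout_df = speeches_df_w_features.randomSplit([0.9, 0.1], seed=24)
for k in [4, 6, 8, 10, 12]:
    candidate = LDA(k=k, maxIter=7, optimizer='online', seed=4).fit(train_df)
    print(k, candidate.logLikelihood(heldout_df), candidate.logPerplexity(heldout_df))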
transformed = lda_model.transform(speeches_df_w_features)
transformed.show(3)
+-----+--------------------+--------------------+--------------------+
|dummy|               words|            features|   topicDistribution|
+-----+--------------------+--------------------+--------------------+
|    1|[behalf, whole, w...|(40170,[14,17,22,...|[8.59581364817453...|
|    1|[whether, plan, i...|(40170,[46,53,55,...|[0.01417611107083...|
|    1|[mr, speaker, ass...|(40170,[29,37,55,...|[0.00402913012389...|
+-----+--------------------+--------------------+--------------------+
only showing top 3 rows
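To realize the document clusters described earlier as hard assignments, one option (not part of the original pipeline, shown here only as a sketch) is to label each speech with its highest-weight topic:
# sketch: assign each document to its highest-weight topic and count cluster sizes
from pyspark.ml.functions import vector_to_array
import pyspark.sql.functions as F
clustered = (transformed
             .withColumn("topics", vector_to_array("topicDistribution"))
             .withColumn("cluster", F.expr("array_position(topics, array_max(topics)) - 1")))
clustered.groupBy("cluster").count().orderBy("cluster").show()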
from pyspark.sql.functions import col
from pyspark.ml.functions import vector_to_array
from pyspark.sql.functions import lit
from pyspark.sql.functions import monotonically_increasing_id, row_number
from pyspark.sql.window import Window
# in this block I add a date column to the dataframe and divide the topic distribution arrays into their own columns
# in order to analyze the distribution of topics over time
# create a spark dataframe and add a row_index column
speeches_df=speeches_df.withColumn('row_index', row_number().over(Window.orderBy(monotonically_increasing_id())))
# these are the variables we have access to aside from the text data
speeches_df.show(3)
# add date column to df
transformed = transformed.withColumn('row_index', row_number().over(Window.orderBy(monotonically_increasing_id())))
transformed_date = transformed.join(speeches_df, on=["row_index"]).drop("speech")
# expand topic distribution column into a column for each topic in vector
col_exploded = (transformed_date.withColumn("xs", vector_to_array("topicDistribution")).
select(["date"] + ["Party"] + ["words"] + [col("xs")[i] for i in range(8)]))
+--------------------+---------+----------+---------+----------+------------+-----------------+---------+
|              speech|person_id|first_name|last_name|      date|       Party|     Constituency|row_index|
+--------------------+---------+----------+---------+----------+------------+-----------------+---------+
|On behalf of the ...|    10295|   Lindsay|    Hoyle|2020-01-07|     Speaker|          Chorley|        1|
|Whether he plans ...|    25886|      Mark|    Logan|2020-01-07|Conservative|Bolton North East|        2|
|First, Mr Speaker...|    24854|     Sajid|    Javid|2020-01-07|Conservative|       Bromsgrove|        3|
+--------------------+---------+----------+---------+----------+------------+-----------------+---------+
only showing top 3 rows
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import pyspark.sql.functions as f
import pandas as pd
# group by week and get mean of topic Distribution
time_df = (col_exploded.withColumn("week_strt_day", f.date_sub(f.next_day(col("date"),"sunday"),7))
.groupBy("week_strt_day")
.mean("xs[0]", "xs[1]", "xs[2]", "xs[3]", "xs[4]", "xs[5]", "xs[6]", "xs[7]"))
time_df = (time_df
.filter(col("week_strt_day").isNotNull())
.select(
f.to_date("week_strt_day", "yyyy-mm-dd").alias("Date"),
"avg(xs[0])",
"avg(xs[1])",
"avg(xs[2])",
"avg(xs[3])",
"avg(xs[4])",
"avg(xs[5])",
"avg(xs[6])",
"avg(xs[7])")
.orderBy("Date")
.toPandas()
)
# topic 5 is about the eu, northern ireland, and trade and we can see from this chart that it clearly peaks
# in January 2020 which is when the Brexit agreement was signed including the Northern Ireland protocol.
time_df.plot.line(x="Date", y="avg(xs[5])", label = "Brexit topic", ylabel = "Mean Topic Distribution")
# topics 0 and 7 on covid
time_df.plot.line(x="Date", y="avg(xs[0])", label = "COVID-19 Topic # 1", ylabel = "Mean Topic Distribution")
time_df.plot.line(x="Date", y="avg(xs[7])", label = "COVID-19 Topic # 2", ylabel = "Mean Topic Distribution")
# topic 1 is on the economy and government spending. There is a peak in March 2021 when
# Chancellor of the Exchequer Rishi Sunak announced major changes to the COVID spending
# See: https://www.twobirds.com/en/news/articles/2020/uk/uk-government-financial-package-for-covid-19-relief.
time_df.plot.line(x="Date", y="avg(xs[1])", label = "Economy Topic", ylabel = "Mean Topic Distribution")
# group rows by Party to get averages of topic distributions within Parties
main_party_list = ['Conservative', 'Labour', 'Scottish National Party', 'Liberal Democrat', 'Green']
party_df = (col_exploded.groupBy("Party")
.mean("xs[0]", "xs[1]", "xs[2]", "xs[3]", "xs[4]", "xs[5]", "xs[6]", "xs[7]")
.filter(col("Party").isin(main_party_list))
.toPandas())
# compare parties in terms of how much they talk about the Scotland/Northern England topic. As we would expect,
# the Scottish National Party speaks most about this topic according to the model.
party_df.plot.bar(x="Party", y="avg(xs[2])", label = "Scotland/Northern England Topic", ylabel = "Mean Topic Distribution")
Here I set out to answer a few questions about optimizing computational efficiency when building an LDA topic model. The first question concerns which algorithm optimizer is most efficient. Spark offers an "online" variational Bayes optimizer (the one used to fit the model above) alongside a batch expectation-maximisation (EM) optimizer. The online algorithm processes a subset of the corpus during each iteration and updates the term-topic distribution adaptively, which suggests it may improve computational efficiency because each iteration builds on the previous term-topic distribution. We can assess the extent to which this is true by training the model repeatedly with different sample sizes and plotting the results, as below.
train, test = speeches_df_w_features.randomSplit([9.0, 1.0], 24)
import numpy as np
from time import time
import random
import matplotlib.pyplot as plt
%matplotlib inline
def measure_speed(lda, train, test, stage):
    testPerplexity = np.zeros(stage)
    computationTime = np.zeros(stage)
    fracs = np.linspace(0.1, 1.0, stage)
    num_repeat = 5
    for j in range(num_repeat):
        for i in range(stage):
            corpus = train.sample(False, fracs[i], 10*j)
            # fit the LDA model on this sample, timing the fit and averaging over repeats
            t0 = time()
            lda_model = lda.fit(corpus)
            t1 = time()
            computationTime[i] += (t1 - t0)/num_repeat
            testPerplexity[i] += lda_model.logPerplexity(test)/num_repeat
    return testPerplexity, computationTime
# set the seeds to have the same initial clustering
# batch (EM) LDA; the optimizer is set explicitly because 'online' is the pyspark default
test_perplexity_em, computation_time_em = measure_speed(LDA(k=8, maxIter=5, optimizer='em', seed=123),
                                                        train, test, 10)
# online LDA
test_perplexity_online, computation_time_online = measure_speed(LDA(k=8, maxIter=5, optimizer='online', seed=123),
train, test, 10)
plt.plot(computation_time_em, label = "em")
plt.plot(computation_time_online, label = "online")
plt.xticks(np.arange(0, 10, 1), [str(i+1) for i in np.arange(0, 10, 1)])
plt.xlabel("training data size *10%")
plt.ylabel("computation time")
plt.legend()
plt.show();
plt.plot(test_perplexity_em, label = "em")
plt.plot(test_perplexity_online, label = "online")
plt.xticks(np.arange(0, 10, 1), [str(i+1) for i in np.arange(0, 10, 1)])
plt.xlabel("training data size *10%")
plt.ylabel("testing preplexity")
plt.legend()
plt.show();
The first plot compares the online optimizer with the expectation-maximisation (EM) optimizer. With larger training data it is more computationally efficient to use the online optimizer. The second plot shows no apparent difference between the two optimizers in terms of test perplexity.
Testing perplexity is a measure of the strength of the model, so it is instructive to consider how to reach a strong model most efficiently. Given the results of the two plots above, we can conclude that there is an advantage to using the online optimizer because it reduces computational cost at larger training data sizes without sacrificing test perplexity.
# number of partitions
train, test = speeches_df_w_features.randomSplit([9.0, 1.0], 24)
import numpy as np
import timeit
# number of partitions to test
npart = [2, 4, 10, 20, 50]
stage = len(npart)
computationTime = np.zeros(stage)
fracs = np.linspace(0.1, 1.0, stage)
for i in range(len(npart)):
    train = train.repartition(npart[i])   # repartition the DataFrames
    test = test.repartition(npart[i])
    corpus = train.sample(False, fracs[i], 10)
    start_time = timeit.default_timer()
    lda = LDA(k=8, maxIter=5, optimizer='online', seed=4)
    lda_model = lda.fit(corpus)
    elapsed = timeit.default_timer() - start_time
    computationTime[i] = elapsed
# plotting the results
from matplotlib import pyplot as plt
%matplotlib inline
# Plot parameters
plt.plot(npart, computationTime, 'ko:') # Basic parameters
plt.xlabel("Number of partitions") # x-axis label
plt.ylabel("Query execution time") # y-axis label
plt.xlim([0,20]) # x-axis limits
#plt.ylim([12,14])
plt.show() # Show
I also analyzed which number of worker nodes is most computationally efficient for running this analysis. There were two highly computationally expensive steps: the first uses CountVectorizer to construct the term-frequency matrix, and the second fits the LDA model to the data. To assess which number of worker nodes was optimal, I ran and timed these two steps with three different numbers of worker nodes. The chart below shows the results.
Interestingly, no single cluster setup was consistently the fastest across both steps, which suggests that different cluster sizes may be optimal for different kinds of analysis. However, considering both steps together, a GCP cluster with 2 worker nodes appears to be the most computationally efficient.
It is tempting to think that the more worker nodes a cluster has, the more computing power it has, and therefore the faster it will perform computations. However, there can be a cost to adding unnecessary worker nodes, and that appears to be the case with the third worker node here.
In this paper I have sought to demonstrate the use of distributed computing with Spark to perform unsupervised machine learning on a corpus of Parliamentary speeches. I used the Spark implementation of the LDA topic model to model this corpus and analyzed the results. I also explored ways of improving computational efficiency by varying the number of RDD partitions and by running the most computationally expensive steps on GCP clusters with different numbers of worker nodes. I believe Spark is a very useful tool for this kind of analysis because it is significantly faster than running these large operations on one machine, and its APIs make functionality like machine learning algorithms straightforward to use.
An important step of topic modelling is validating the model. I have tried to validate the topic model in a number of ways: by studying the top-weighted words in each topic to assess its coherence, by assessing the log likelihood and perplexity of the model on held-out documents, by analyzing the distribution of topics over time, and by comparing the topics to the political party of the speaker. For the most part the results coincide with my intuitive understanding of the topics. Using these validation techniques together with my own judgement of what a coherent topic looks like, I ended up using K=8 topics.
Next steps could include exploring other topic model algorithms, such as the correlated topic model (CTM) of Blei and Lafferty (2005, 2007), which replaces the Dirichlet distribution of topic shares within documents with a logistic normal distribution. Roberts et al. (2013) have also developed a structural topic model (STM), which allows topic proportions within documents to vary with document-level covariates.
[1] Apache Spark: Machine Learning Library (MLlib) Guide. https://spark.apache.org/docs/latest/ml-guide.html.
[2] Apache Spark: Latent Dirichlet Allocation (LDA). https://spark.apache.org/docs/latest/ml-clustering.html#latent-dirichlet-allocation-lda.
[3] Blei, D.M., A.Y. Ng and M.I. Jordan. 2003. “Latent dirichlet allocation.” The Journal of Machine Learning Research 3:993–1022.
[4] Blei, David M. 2012. “Probabilistic topic models.” Communications of the ACM 55(4):77. doi: 10.1145/2133806.2133826.
[5] Blei, David, and John Lafferty. "Correlated topic models." Advances in neural information processing systems 18 (2006): 147.
[6] MySociety. Hansard Reports. https://parser.theyworkforyou.com/hansard.html.
[7] Odell, Evan. 2020. "Hansard Speeches Version 3." https://github.com/evanodell/hansard-data3.
[8] Roberts, Margaret E., et al. "The structural topic model and applied social science." Advances in neural information processing systems workshop on topic models: computation, application, and evaluation. Vol. 4. 2013.
[9] Zaharia, M., Chowdhury, M., Das, T., Dave, A., Ma, J., McCauley, M., ... & Stoica, I. (2012, April). Resilient distributed datasets: A fault-tolerant abstraction for in-memory cluster computing. In Proceedings of the 9th USENIX conference on Networked Systems Design and Implementation (pp. 2-2). USENIX Association.