@Vishy First, I downloaded the NLTK stop words and WordNet data using a code editor.
Next, I split the code into steps so that it is easier to debug:
import re
import unicodedata
import nltk
from nltk.corpus import stopwords
import pandas as pd
import matplotlib.pyplot as plt
# Extra words to discard on top of NLTK's English stop-word list.
ADDITIONAL_STOPWORDS = ['covfefe']


def basic_clean(text):
    """Normalize *text* and return its lemmatized, stop-word-free words.

    The text is NFKD-normalized, reduced to ASCII (characters that cannot be
    encoded are dropped), lower-cased, stripped of every character that is
    neither a word character nor whitespace, and split on whitespace. Words
    found in NLTK's English stop-word list or in ``ADDITIONAL_STOPWORDS`` are
    discarded; the remaining words are lemmatized with ``WordNetLemmatizer``.

    Requires the NLTK ``stopwords`` and ``wordnet`` data to be available.

    Args:
        text: The string to clean.

    Returns:
        A list of lemmatized words, in the order they appear in *text*.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    # Use a set for O(1) membership tests, and a distinct local name so the
    # imported ``stopwords`` corpus reader is not shadowed.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    stop_words.update(ADDITIONAL_STOPWORDS)
    ascii_text = (
        unicodedata.normalize('NFKD', text)
        .encode('ascii', 'ignore')
        .decode('utf-8', 'ignore')
        .lower()
    )
    # Drop punctuation before splitting so tokens are bare words.
    words = re.sub(r'[^\w\s]', '', ascii_text).split()
    return [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
# Join the rows with spaces so each text is cleaned as written. Calling str()
# on the whole list would pass its repr (brackets, quotes, escape sequences
# such as "\n") to the cleaner and glue stray letters onto words.
# NOTE(review): `dataset` is not defined in this snippet - presumably the
# DataFrame supplied by the host environment; confirm.
words = basic_clean(' '.join(dataset['text'].astype(str)))

# Count bigrams, keep the 12 most frequent, and sort ascending for plotting.
bigrams_series = (
    pd.Series(nltk.ngrams(words, 2))
    .value_counts()
    .head(12)
    .sort_values()
)

# Series.plot.barh accepts color/width/figsize; plt.barh does not take figsize
# and expects explicit y positions and bar widths.
ax = bigrams_series.plot.barh(color='blue', width=.9, figsize=(12, 8))
plt.show()
test (1).pbix (31.6 KB)