Implementing N-Grams + Python + Power BI

AntrikshSharma · February 20, 2023, 8:23am

@Vishy First, I downloaded the NLTK `stopwords` and `wordnet` corpora using a code editor.

Next, I split the code into steps so that it is easier to debug:

import re
import unicodedata
import nltk
from nltk.corpus import stopwords
import pandas as pd
import matplotlib.pyplot as plt

# Extra words to discard in addition to NLTK's English stop-word list.
ADDITIONAL_STOPWORDS = ['covfefe']

def basic_clean(text):
  """Normalize ``text`` and return its lemmatized, stop-word-free tokens.

  The text is NFKD-decomposed, reduced to ASCII (non-ASCII characters are
  dropped), lower-cased, stripped of every character that is neither a word
  character nor whitespace, and split on whitespace. Tokens found in the
  stop-word set are discarded; the rest are passed through NLTK's
  ``WordNetLemmatizer``.

  Requires the NLTK ``stopwords`` and ``wordnet`` corpora to be available.

  Args:
    text: The raw string to clean.

  Returns:
    A list of lemmatized tokens, in their original order.
  """
  lemmatizer = nltk.stem.WordNetLemmatizer()
  strip_punctuation = re.compile(r'[^\w\s]')
  # A set gives constant-time membership tests; the local name also avoids
  # shadowing the module-level ``stopwords`` import.
  stop_words = set(nltk.corpus.stopwords.words('english') + ADDITIONAL_STOPWORDS)
  # Punctuation is removed from the text before filtering, so a contraction
  # such as "don't" reaches the filter as "dont". Add the stripped form of
  # every stop word so those tokens are still recognised.
  stop_words |= {strip_punctuation.sub('', word) for word in stop_words}
  ascii_text = (unicodedata.normalize('NFKD', text)
    .encode('ascii', 'ignore')
    .decode('utf-8', 'ignore')
    .lower())
  tokens = strip_punctuation.sub('', ascii_text).split()
  return [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

# NOTE(review): `dataset` is not defined in this script; presumably it is the
# DataFrame that Power BI hands to the Python visual -- confirm.
# Join the rows with spaces rather than stringifying the whole list: str(list)
# yields the list's repr, which leaks escape sequences (e.g. a literal
# backslash-n) into the text and glues stray characters onto tokens.
words = basic_clean(' '.join(dataset['text'].astype(str)))

# Count every pair of adjacent words and keep the 12 most frequent pairs.
bigrams_series = pd.Series(list(nltk.ngrams(words, 2))).value_counts()[:12]

# Ascending order puts the most frequent bigram at the top of a horizontal
# bar chart. Rebinding avoids mutating a slice in place.
bigrams_series = bigrams_series.sort_values()

# Use the Series plotting accessor: it labels bars with the index and accepts
# `figsize`. `plt.barh` requires explicit y/width values and has no `figsize`
# keyword, so the previous call could not produce the chart.
ax = bigrams_series.plot.barh(color='blue', width=.9, figsize=(12, 8))
plt.show()

test (1).pbix (31.6 KB)