Python Workout 12: Data Functions

Level of Difficulty:

Objective: This workout provides practice in creating functions that incorporate string manipulation, statistics, and data cleaning.

1. Given the following string, input_string, write a Python function that takes a string and removes all stop words (e.g. “the”, “a”, “an”) from it.

input_string = "The quick brown fox jumps over the lazy dog"

2. Given the following string, input_string, write a Python function that takes a string and performs stemming on it to reduce all words to their base form (e.g. “running” to “run”).

input_string = "I am running on the beach and feeling amazing"

3. Given the following numpy array, data, write a Python program to remove outliers using the Z-score method.

data = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 100])

Simply post your code and a screenshot of your results.

Please format your Python code and blur it or place it in a hidden section.

This workout will be released on Monday May 8, 2023, and the author’s solution will be posted on Sunday May 14, 2023.

Hello,
Here are my solutions:

Challenge 1

Summary
# Challenge 1
'''Given the following string, input_string, write a Python function that takes a string 
and removes all stop words (e.g. “the”, “a”, “an”) from it.'''

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

input_string = "The quick brown fox jumps over the lazy dog"

def remove_stop_words(sentense):
    """Return ``sentense`` with all English stop words removed.

    The text is tokenized with NLTK's ``word_tokenize``; every token whose
    lowercase form is in NLTK's English stop-word list is dropped, and the
    remaining tokens are joined back together with single spaces.

    Args:
        sentense: The text to filter.

    Returns:
        The filtered text as one space-separated string.
    """
    stop_words = set(stopwords.words('english'))

    # Tokenize the argument itself, not the module-level ``input_string``,
    # so the function works for whatever text the caller passes in.
    words = word_tokenize(sentense)

    filtered_words = [w for w in words if w.lower() not in stop_words]
    return " ".join(filtered_words)

remove_stop_words(input_string)

image

Challenge 2 (the result is not fully correct)

Summary
# Challenge 2
'''
Given the following string, input_string, write a Python function that takes a string and performs 
stemming on it to reduce all words to their base form (e.g. “running” to “run”).
'''

from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import functools as ftools

input_string = "I am running on the beach and feeling amazing"

def stemming(sentense):
    """Return ``sentense`` with every token replaced by its Porter stem.

    The text is tokenized with NLTK's ``word_tokenize``, each token is passed
    through ``PorterStemmer.stem``, and the results are joined with single
    spaces. Text that yields no tokens produces an empty string.

    Args:
        sentense: The text to stem.

    Returns:
        The stemmed tokens as one space-separated string.
    """
    ps = PorterStemmer()

    # Tokenize the argument itself, not the module-level ``input_string``.
    words = word_tokenize(sentense)

    # Stem every token and join. A two-argument reduce() would use the first
    # token as its seed (leaving it unstemmed) and raise on an empty list.
    return " ".join(ps.stem(word) for word in words)

stemming(input_string)

image

Challenge 3

Summary
# Challenge 3
'''
Given the following numpy array, data, write a Python program to remove outliers using the Z-score method.
'''
import numpy as np
data = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 100])

def remove_outliers(array, threshold=3):
    """Drop values lying ``threshold`` or more standard deviations from the mean.

    Args:
        array: One-dimensional numeric data (a numpy array or any sequence
            that ``np.asarray`` can convert).
        threshold: Number of population standard deviations that bounds the
            accepted range on each side of the mean. Defaults to 3.

    Returns:
        A list of the values strictly inside
        ``(mean - threshold * std, mean + threshold * std)``, in their
        original order.
    """
    values = np.asarray(array)
    array_mean = values.mean()
    array_std = values.std()

    # With zero spread every value equals the mean, so nothing is an outlier;
    # the strict comparisons below would otherwise discard all of them.
    if array_std == 0:
        return list(values)

    upper_limit = array_mean + threshold * array_std
    lower_limit = array_mean - threshold * array_std

    inside = (values > lower_limit) & (values < upper_limit)
    return list(values[inside])

remove_outliers(data)

image

Thank you, that was quite challenging for me.

1 Like

Learned a lot from this one. I'm really getting the hang of Python lately.

Challenge 1

def remove_stop_words(input_string):
    """Strip the articles "the", "a" and "an" out of a string.

    The string is split on whitespace, each piece is compared
    case-insensitively against the three articles, and the surviving pieces
    are re-joined with single spaces.
    """
    articles = ["the", "a", "an"]
    kept = []
    for token in input_string.split():
        if token.lower() in articles:
            # An article: leave it out of the result.
            continue
        kept.append(token)
    return " ".join(kept)

input_string = "The quick brown fox jumps over the lazy dog"
output_string = remove_stop_words(input_string)
print(output_string)

Challenge 2

from nltk.stem import PorterStemmer

def perform_stemming(input_string):
    """Stem each whitespace-separated word of a string with PorterStemmer.

    The string is split on whitespace, every piece is passed through
    ``PorterStemmer.stem``, and the stems are re-joined with single spaces.
    """
    porter = PorterStemmer()
    tokens = input_string.split()
    return " ".join(map(porter.stem, tokens))

input_string = "I am running on the beach and feeling amazing"
output_string = perform_stemming(input_string)
print(output_string)

Challenge 3

import numpy as np

def remove_outliers_zscore(data, threshold=3):
    """Return the values of ``data`` whose absolute Z-score is below ``threshold``.

    Args:
        data: Numeric data (a numpy array or any sequence that
            ``np.asarray`` can convert).
        threshold: Exclusive upper bound on ``|z|`` for a value to be kept.
            Defaults to 3.

    Returns:
        A new numpy array holding the retained values in their original
        order.
    """
    values = np.asarray(data)
    std = np.std(values)

    # Zero spread means every value equals the mean, so nothing is an
    # outlier. Dividing by zero would give NaN scores and drop everything.
    if std == 0:
        return values.copy()

    z_scores = (values - np.mean(values)) / std
    return values[np.abs(z_scores) < threshold]

data = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 100])
filtered_data = remove_outliers_zscore(data)
print(filtered_data)

1 Like

Very NLP-esque workout this time.

1. Given the following string, input_string, write a Python function that takes a string and removes all stop words (e.g. “the”, “a”, “an”) from it.

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def remove_stop_words(input_string):
    """Filter NLTK's English stop words out of a string.

    Tokens come from ``word_tokenize``; a token is kept only when its
    lowercase form is absent from ``stopwords.words('english')``. The kept
    tokens are joined with single spaces.
    """
    english_stops = set(stopwords.words('english'))
    kept_tokens = []
    for token in word_tokenize(input_string):
        if token.lower() not in english_stops:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)

input_string = "The quick brown fox jumps over the lazy dog"
output_string = remove_stop_words(input_string)
print(output_string)

2. Given the following string, input_string, write a Python function that takes a string and performs stemming on it to reduce all words to their base form (e.g. “running” to “run”).

from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

def perform_stemming(input_string):
    """Tokenize a string and replace each token with its Porter stem.

    Tokens come from ``word_tokenize``; each is passed through
    ``PorterStemmer.stem`` and the stems are joined with single spaces.
    """
    porter = PorterStemmer()
    tokens = word_tokenize(input_string)
    return ' '.join(map(porter.stem, tokens))

input_string = "running dogs are happily running and playing"
output_string = perform_stemming(input_string)
print(output_string)

3. Given the following numpy array, data, write a Python program to remove outliers using the Z-score method.

import numpy as np

def z_outliers(data, threshold=3):
    """Return the values of ``data`` whose absolute Z-score is below ``threshold``.

    Args:
        data: Numeric data (a numpy array or any sequence that
            ``np.asarray`` can convert).
        threshold: Exclusive upper bound on ``|z|`` for a value to be kept.
            Defaults to 3.

    Returns:
        A new numpy array holding the retained values in their original
        order.
    """
    values = np.asarray(data)
    std = np.std(values)

    # Zero spread means every value equals the mean, so nothing is an
    # outlier. Dividing by zero would give NaN scores and drop everything.
    if std == 0:
        return values.copy()

    z_scores = (values - np.mean(values)) / std
    return values[np.abs(z_scores) < threshold]