# Module 4: PySpark Programming For Beginners


## Install and initialize

In [None]:
!pip install pyspark
from pyspark.sql import SparkSession

# Get (or create) a SparkSession in local mode ("local[*]") named "Test",
# and keep a handle to its underlying SparkContext as `sc`.
sc = (
    SparkSession.builder
    .master("local[*]")
    .appName("Test")
    .getOrCreate()
    .sparkContext
)

## Python 
### Lambda function examples
In the demonstration, the **map** function takes a function and a list as parameters. It will apply the function to each element in the list to give a new list.

The **reduce** function takes a function and a list as parameters. It applies the function repeatedly to the elements in the list, combining them two at a time, so that the values in the list are aggregated into a single value.

In Python 3, **map** returns a lazy iterator rather than a list, so the result is converted to a list for printing.

In [None]:
# map + lambda: square every number in the list
l1 = [1, 2, 3]
print(list(map(lambda n: n * n, l1)))

# map + lambda: break each line into words (split on whitespace)
lines = ['line 1 text', 'line 2 text']
print(list(map(lambda text_line: text_line.split(), lines)))

# map + lambda: break each multi-line paragraph into its individual lines
paragraphs = [
''' paragraph 1
line 1''',
''' paragraph 2.
line 2'''
]
print(list(map(lambda para: para.split('\n'), paragraphs)))

# map + lambda: turn each word into a (word, 1) key-value pair
words = ['apple', 'sheep', 'peach', 'wolf']
print(list(map(lambda w: (w, 1), words)))

# map + lambda: pull out the first element (key) of each pair,
# then the second element (value) of each pair
pairs = [('apple', 1), ('sheep', 1), ('peach', 1), ('wolf', 1)]
print(list(map(lambda kv: kv[0], pairs)))
print(list(map(lambda kv: kv[1], pairs)))


In [None]:
from functools import reduce
# reduce + lambda: fold the list into one total by adding values pairwise
l1 = [1, 3, 5, 7]
print(reduce(lambda running_total, value: running_total + value, l1))


## RDD
1. Numerical data manipulations

In [None]:
# Distribute a small list of numbers as an RDD, then derive a second RDD
# holding the square of every element.
rdd = sc.parallelize([
    1, 2, 3, 4
])
squared = rdd.map(lambda x: x*x)

# Show the input and the transformed data under separate labels; previously
# the squared values were printed under the 'Original list:' label.
print('Original list:', rdd.collect())  # Get all of the input elements
print('Squared list:', squared.collect())  # Get all of the squared elements

# The remaining actions all operate on the squared RDD.
print('First 3:', squared.take(3))  # Get the first three
print('First:', squared.first())
print('Count:', squared.count())
print('Sum:', squared.sum())
print('Mean:', squared.mean())
print('Minimum:', squared.min())
print('Maximum:', squared.max())


2. Text manipulations

In [None]:
# Data: sample corpus used as input for the text-manipulation examples.
# It is a single multi-line string whose first line reads
# "word count from Wikipedia the free encyclopedia". Lines are separated by
# '\n' and words by whitespace. The literal ends with a newline, so
# splitting it on '\n' produces a trailing empty string.
text = '''word count from Wikipedia the free encyclopedia
the word count is the number of words in a document or passage of text Word counting may be needed when a text
is required to stay within certain numbers of words This may particularly be the case in academia legal
proceedings journalism and advertising Word count is commonly used by translators to determine the price for
the translation job Word counts may also be used to calculate measures of readability and to measure typing
and reading speeds usually in words per minute When converting character counts to words a measure of five or
six characters to a word is generally used Contents Details and variations of definition Software In fiction
In non fiction See also References Sources External links Details and variations of definition
This section does not cite any references or sources Please help improve this section by adding citations to
reliable sources Unsourced material may be challenged and removed
Variations in the operational definitions of how to count the words can occur namely what counts as a word and
which words don't count toward the total However especially since the advent of widespread word processing there
is a broad consensus on these operational definitions and hence the bottom line integer result
The consensus is to accept the text segmentation rules generally found in most word processing software including how
word boundaries are determined which depends on how word dividers are defined The first trait of that definition is that a space any of various whitespace
characters such as a regular word space an em space or a tab character is a word divider Usually a hyphen or a slash is too
Different word counting programs may give varying results depending on the text segmentation rule
details and on whether words outside the main text such as footnotes endnotes or hidden text) are counted But the behavior
of most major word processing applications is broadly similar However during the era when school assignments were done in
handwriting or with typewriters the rules for these definitions often differed from todays consensus
Most importantly many students were drilled on the rule that certain words don't count usually articles namely a an the but
sometimes also others such as conjunctions for example and or but and some prepositions usually to of Hyphenated permanent
compounds such as follow up noun or long term adjective were counted as one word To save the time and effort of counting
word by word often a rule of thumb for the average number of words per line was used such as 10 words per line These rules
have fallen by the wayside in the word processing era the word count feature of such software which follows the text
segmentation rules mentioned earlier is now the standard arbiter because it is largely consistent across documents and
applications and because it is fast effortless and costless already included with the application As for which sections of
a document count toward the total such as footnotes endnotes abstracts reference lists and bibliographies tables figure
captions hidden text the person in charge teacher client can define their choice and users students workers can simply
select or exclude the elements accordingly and watch the word count automatically update Software Modern web browsers
support word counting via extensions via a JavaScript bookmarklet or a script that is hosted in a website Most word
processors can also count words Unix like systems include a program wc specifically for word counting
As explained earlier different word counting programs may give varying results depending on the text segmentation rule
details The exact number of words often is not a strict requirement thus the variation is acceptable
In fiction Novelist Jane Smiley suggests that length is an important quality of the novel However novels can vary
tremendously in length Smiley lists novels as typically being between and words while National Novel Writing Month
requires its novels to be at least words There are no firm rules for example the boundary between a novella and a novel
is arbitrary and a literary work may be difficult to categorise But while the length of a novel is to a large extent up
to its writer lengths may also vary by subgenre many chapter books for children start at a length of about words and a
typical mystery novel might be in the to word range while a thriller could be over words
The Science Fiction and Fantasy Writers of America specifies word lengths for each category of its Nebula award categories
Classification	Word count Novel over words Novella to words Novelette to words Short story under words
In non fiction The acceptable length of an academic dissertation varies greatly dependent predominantly on the subject
Numerous American universities limit Ph.D. dissertations to at most words barring special permission for exceeding this limit
'''

In [None]:
# Break the text into lines on the newline character and distribute
# them as an RDD with one element per line.
lines = sc.parallelize(text.split('\n'))
print("collection of lines:")
print(lines.collect())

# map: each line becomes its own list of words, so the result is nested
# (one list per line).
words = lines.map(lambda row: row.split())
print("\ncollection of words, not flattened:")
print(words.collect())

# flatMap: split each line into words and merge them into one flat
# collection, then lower-case every word.
words = (
    lines
    .flatMap(lambda row: row.split())
    .map(lambda token: token.lower())
)
print("\ncollection of words, flattened:")
print(words.collect())

# countByValue: tally how many times each distinct word occurs.
print('\ncounts of words')
counts = words.countByValue()
print(counts)

# map: turn every word into a (word, 1) key-value pair.
pairs = words.map(lambda token: (token, 1))
print('\ncollection of pairs')
print(pairs.collect())

3. Key-value Pair manipulations

In [None]:
# Distribute a small collection of (key, value) pairs as an RDD.
pairs = sc.parallelize([
    ('dog', 1), ('cat', 2), ('dog', 3), ('cat', 5), ('fish', 1)
])
print('Original count as pairs')
print(pairs.collect())

# countByKey: how many pairs share each key (the values are ignored).
counts = pairs.countByKey()
print('Count stats generated using countByKey, Python dict like results')
print(counts)

# reduceByKey: add up the values of all pairs that share a key.
# The result is another RDD of pairs, so it has to be collected.
counts = pairs.reduceByKey(lambda left, right: left + right)
print('Count stats generated using reduceByKey, pair like results')
print(counts.collect())

# Order the pairs by key, descending, and show the first three.
sortedByKey = pairs.sortByKey(ascending=False)
print('Top 3 sorted by key')
print(sortedByKey.take(3))

# Order the pairs by their value (second element), descending,
# and show the first three.
sortedByCounts = pairs.sortBy(lambda kv: kv[1], ascending=False)
print('Top 3 sorted by value (counts)')
print(sortedByCounts.take(3))
