# 17_map_reduce.py — forked from justmarkham/DAT4 (91 lines, 71 loc)
# -*- coding: utf-8 -*-
"""
Created on Wed Feb 18
@author: sinanozdemir
adapted from https://www.wakari.io/sharing/bundle/nkorf/MapReduce%20Example
"""
import re
def mapper(line, queue = None):
result = []
# remove leading and trailing whitespace
line = line.strip()
# remove odd symbols from the text
line = re.sub('[!"§$%&/()=?*#()\[\],.<>:;~_-]',"", line)
# split the line into words
words = line.split(" ")
# insert the cleaned words into the results list
for word in words:
result.append((word, 1))
if queue:
queue.put(result)
return result
# output is a list of (key, value) pairs
# print() so the example output is visible when run as a script,
# not only when echoed by an interactive session
print(mapper("Hi everyone Hi Hi"))
# [('Hi', 1), ('everyone', 1), ('Hi', 1), ('Hi', 1)]
# note that duplicates are expected
# the reducer function is very simple! All it will do is sup up similar values from sorted key value pairs
def reducer(key, values):
    """Print the reduced result for one key: the sum of all its counts.

    key:    the word being reduced.
    values: list of counts gathered by shuffle() for that word.
    """
    # single-argument print() call works on both Python 2 and Python 3
    # (the original 'print "..."' statement is a SyntaxError on Python 3)
    print("Reducer result -> %s : %d" % (key, sum(values)))
# the shuffle function gathers up the like key words
# once it gathers them up, it calls the reduce function!
def shuffle(words):
    """Group (word, count) pairs by word and feed each group to reducer().

    words: iterable of (key, value) tuples, e.g. the output of mapper().
           Duplicates are expected; sorting makes equal keys adjacent so
           they can be collected in a single pass.

    For each distinct key this prints the key and its gathered value
    list, then calls reducer(key, values).
    """
    # sorting the pairs puts identical keys next to each other
    sorted_pairs = sorted(words)
    # empty input: nothing to reduce (the original emitted a bogus
    # reducer("", []) call in this case)
    if not sorted_pairs:
        return
    # seed the current group from the first pair instead of using a ""
    # sentinel, so a genuinely-empty-string key is grouped correctly
    current_key = sorted_pairs[0][0]
    val_list = []
    for key, value in sorted_pairs:
        if key != current_key:
            # key changed: flush the finished group to the reducer
            print(current_key, val_list)
            reducer(current_key, val_list)
            current_key = key
            val_list = []
        val_list.append(value)
    # flush the last group
    print(current_key, val_list)
    reducer(current_key, val_list)
# demo: shuffle/reduce the example pairs produced by mapper above
shuffle([('Hi', 1), ('everyone', 1), ('Hi', 1), ('Hi', 1)])

# list of sentences to analyze
sentences = ['hello big data big big big data ',
             'big data is the best',
             'big data is the best data big',
             'hello big data how are data',
             'big big big data',
             'data data big big']

# get the first sentence
first_sentence = sentences[0]
# map the first sentence; print() so the pairs are visible when run as a
# script (a bare expression only echoes in an interactive session)
print(mapper(first_sentence))
# send the mapped sequence to the shuffler/reducer
shuffle(mapper(first_sentence))

# now do it for all of the sentences one by one
output_map = []
for sentence in sentences:
    output_map += mapper(sentence)
# total (key, value) pairs in one list — printed, since the bare
# expression was a no-op outside a REPL
print(output_map)
# call the shuffle function, which also calls the reduce function
shuffle(output_map)