$ ./bin/pyspark
Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43)
[Clang 6.0 (clang-600.0.57)] on darwin
Type "help", "copyright", "credits" or "license" for more information.
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.4
      /_/
Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43)
SparkSession available as 'spark'.
>>>
>>> a =[ 1, 2, 3, 4, 5]
>>> rdd = spark.sparkContext.parallelize(a)
>>> rdd.collect()
[1, 2, 3, 4, 5]
>>> rdd.count()
5
>>> sumofvalues = rdd.reduce(lambda x, y: x+y)
>>> sumofvalues
15
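
Note (not part of the recorded session): reduce() folds the RDD's elements with a binary function that Spark expects to be commutative and associative. A minimal local sketch of the same fold, using Python's functools.reduce on the same list:

    import functools
    functools.reduce(lambda x, y: x + y, [1, 2, 3, 4, 5])   # 15, matching rdd.reduce() above
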
>>>
>>> product = rdd.reduce(lambda x, y: x*y)
>>> product
120
>>> z = [ "1", "2", "3", "4", "5", "6", "7"]
>>> rdd = spark.sparkContext.parallelize(z)
>>> rdd.collect()
['1', '2', '3', '4', '5', '6', '7']
>>> concat = rdd.reduce(lambda x, y: x+y)
>>> concat
'1234567'
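
Note (not part of the recorded session): string concatenation is associative but not commutative, so the in-order result above is not something the reduce() contract guarantees; the reducing function is supposed to be both commutative and associative. If strict left-to-right order matters and the data fits on the driver, one order-safe sketch is to collect and join:

    "".join(rdd.collect())   # '1234567'; collect() returns elements in partition order
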
>>>
>>> [ "1", "2", "3", "4", "5", "6", "7"]
['1', '2', '3', '4', '5', '6', '7']
>>> z = [ "1", "2", "3", "4", "5", "6", "7", "8", "9", "a", "b"]
>>>
>>>
>>> z
['1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b']
>>> rdd = spark.sparkContext.parallelize(z, 3)
>>> rdd.collect()
['1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b']
>>> concat = rdd.reduce(lambda x, y: x+y)
>>> concat
'123456789ab'
>>> rdd = spark.sparkContext.parallelize(z, 10)
>>> concat = rdd.reduce(lambda x, y: x+y)
>>> concat
'123456789ab'
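
Note (not part of the recorded session): the concatenation comes out the same with 3 or 10 partitions because the per-partition results are combined in partition order. A small sketch for inspecting how parallelize() actually split the data, using getNumPartitions() and glom():

    rdd = spark.sparkContext.parallelize(z, 3)
    rdd.getNumPartitions()   # 3
    rdd.glom().collect()     # one list per partition, e.g. [['1', '2', '3'], ['4', '5', '6', '7'], ['8', '9', 'a', 'b']]
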
>>>
>>> nums = [1, 3, 5, 4, 2, 1, 0, 9, 10]
>>> nums
[1, 3, 5, 4, 2, 1, 0, 9, 10]
>>> rdd = spark.sparkContext.parallelize(nums)
>>> rdd.collect()
[1, 3, 5, 4, 2, 1, 0, 9, 10]
>>> rdd.count()
9
>>> sumvalues = rdd.reduce(lambda a, b: a+b)
>>> sumvalues
35
>>> product = rdd.reduce(lambda a, b: a*b)
>>> product
0
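
Note (not part of the recorded session): the product is 0 because the list contains a 0, and any product that includes zero is zero. If the product of only the non-zero values is wanted, one sketch is to filter before reducing:

    nonzero = rdd.filter(lambda v: v != 0)
    nonzero.reduce(lambda a, b: a * b)   # 1*3*5*4*2*1*9*10 = 10800
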
>>> nums = [1, 3, 5, 4, 2, 1, 30, 9, 10]
>>> rdd = spark.sparkContext.parallelize(nums)
>>> sumvalues = rdd.reduce(lambda a, b: a+b)
>>> sumvalues
65
>>> product = rdd.reduce(lambda a, b: a*b)
>>> product
324000
>>> rdd.collect()
[1, 3, 5, 4, 2, 1, 30, 9, 10]
>>> strs = ["1", "3", "5", "4", "2", "1"]
>>> strs
['1', '3', '5', '4', '2', '1']
>>> rdd = spark.sparkContext.parallelize(strs)
>>> concat = rdd.reduce(lambda a, b: a+b)
>>> concat
'135421'
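
Note (not part of the recorded session): a related action is fold(), which takes an explicit zero value that must be an identity for the operation (it is applied once per partition and again when merging partition results). A minimal sketch equivalent to the concatenation above:

    rdd.fold("", lambda a, b: a + b)   # '135421'; "" is the identity for string concatenation
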