Main()
boolean success = job.waitForCompletion(true);
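A minimal driver sketch (hypothetical class; mapper/reducer setup and paths are placeholders, only the waitForCompletion(true) call matches the trace below):
// Hypothetical driver; only waitForCompletion(true) is taken from the note.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Main {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = new Job(conf, "sample-job");           // Hadoop 1.x style constructor
    job.setJarByClass(Main.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    boolean success = job.waitForCompletion(true);   // blocks and prints progress
    System.exit(success ? 0 : 1);                    // exit status reflects job outcome
  }
}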
Job.java
waitForCompletion
if (state == JobState.DEFINE) {
submit();
}
public void submit() throws IOException, InterruptedException,
                            ClassNotFoundException {
  ensureState(JobState.DEFINE);
  setUseNewAPI();
  // Connect to the JobTracker and submit the job
  connect();
  info = jobClient.submitJobInternal(conf);
  super.setJobID(info.getID());
  state = JobState.RUNNING;
}
JobClient.java
this.rpcJobSubmitClient =
    createRPCProxy(JobTracker.getAddress(conf), conf);
this.jobSubmitClient = createProxy(this.rpcJobSubmitClient, conf);
status = jobSubmitClient.submitJob(
    jobId, submitJobDir.toString(), jobCopy.getCredentials());
JobTracker.java
submitJob()
status = addJob(jobId, job);
(details from this step to the next are described at http://www.jiancool.com/article/8165570161/;jsessionid=DA095379D6634B03536D6B2FD3286707)
public void initJob(JobInProgress job)
job.initTasks();
JobInProgress.java ->
TaskSplitMetaInfo[] splits = createSplits(jobId);
TaskSplitMetaInfo[] allTaskSplitMetaInfo =
    SplitMetaInfoReader.readSplitMetaInfo(jobId, fs, jobtracker.getConf(),
                                          jobSubmitDir);
SplitMetaInfoReader
mapreduce.jobtracker.split.metainfo.maxsize
read in SplitMetaInfo[]
construct a TaskSplitIndex (split file path and start offset of a split)
construct a TaskSplitMetaInfo (TaskSplitIndex, split locations, split length); see the sketch below
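A rough sketch of how the reader assembles the TaskSplitMetaInfo entries; the JobSplit constructors and getters used here are assumptions about the Hadoop API, not copied from its source:
// Sketch only: assumes TaskSplitIndex(String, long), TaskSplitMetaInfo(TaskSplitIndex,
// String[], long), and SplitMetaInfo getters with these names.
import org.apache.hadoop.mapreduce.split.JobSplit.SplitMetaInfo;
import org.apache.hadoop.mapreduce.split.JobSplit.TaskSplitIndex;
import org.apache.hadoop.mapreduce.split.JobSplit.TaskSplitMetaInfo;

public class SplitMetaSketch {
  // jobSplitFile: path of the serialized split file; metas: entries read from job.splitmetainfo.
  static TaskSplitMetaInfo[] toTaskSplitMetaInfo(String jobSplitFile, SplitMetaInfo[] metas) {
    TaskSplitMetaInfo[] out = new TaskSplitMetaInfo[metas.length];
    for (int i = 0; i < metas.length; i++) {
      TaskSplitIndex idx =
          new TaskSplitIndex(jobSplitFile, metas[i].getStartOffset());  // file + offset of split i
      out[i] = new TaskSplitMetaInfo(idx,
                                     metas[i].getLocations(),           // split locations
                                     metas[i].getInputDataLength());    // split length
    }
    return out;
  }
}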
maps[i] = new TaskInProgress(jobId, jobFile, splits[i],
                             jobtracker, conf, this, i, numSlotsPerMap);
TaskInProgress.java
t = new MapTask(jobFile, taskid, partition, splitInfo.getSplitIndex(),
                numSlotsNeeded);
MapTask.java
runNewMapper(job, splitMetaInfo, umbilical, reporter);
org.apache.hadoop.mapreduce.InputSplit split = null;
split = getSplitDetails(new Path(splitIndex.getSplitLocation()),
                        splitIndex.getStartOffset());
********************************************************************************************************
SampleSplit serialization
SampleSplit is written to a file.
The most important SampleSplit metadata includes (see the sketch after this list):
1. split serialization file path: JobSubmissionFiles.getJobSplitFile(jobSubmitDir).toString()
2. offset of a split in the serialization file: recorded by SplitWriter
3. split locations: from the InputSplit interface
4. split length: from the InputSplit interface
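A hypothetical shape of SampleSplit (field names and serialization format are illustrative, not the project's actual code); items 3 and 4 come from the InputSplit interface, while items 1 and 2 are handled by the split writer:
// Illustrative only; real SampleSplit may serialize different fields.
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.InputSplit;

public class SampleSplit extends InputSplit implements Writable {
  private long length;         // split length (item 4)
  private String[] locations;  // split locations (item 3)

  @Override
  public long getLength() { return length; }

  @Override
  public String[] getLocations() { return locations; }

  @Override
  public void write(DataOutput out) throws IOException {   // written into the job split file (item 1)
    out.writeLong(length);
    out.writeInt(locations.length);
    for (String loc : locations) Text.writeString(out, loc);
  }

  @Override
  public void readFields(DataInput in) throws IOException { // read back at the given offset (item 2)
    length = in.readLong();
    locations = new String[in.readInt()];
    for (int i = 0; i < locations.length; i++) locations[i] = Text.readString(in);
  }
}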
/***********************************************************
Directly generate samples for a subset of the whole data without pre-filtering the subset out first.
workload
How traditional MapReduce does the computation: draw a diagram.
Send a value for all weights.
****************************************tiny image data set****
% valid field names:
% 1. keyword 0
% 2. filename 80
% 3. width 175
% 4. height 177
% 5. colors 179
% 6. date 183
% 7. engine 212
% 8. thumb_url 222
% 9. source_url 422
% 10. page 750
% 11. ind_page 754
% 12. ind_engine 758
% 13. ind_overall 762
% 14. label (1 = correct, 0 = incorrect, -1 = unlabelled) 766
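A minimal parsing sketch, assuming fixed-width metadata records of 768 bytes and treating each trailing number above as the field's byte offset within a record (both assumptions, not stated in the note):
// Slices one tiny-image metadata record by the offsets listed above.
import java.io.DataInputStream;
import java.io.FileInputStream;
import java.nio.charset.StandardCharsets;

public class TinyImageRecord {
  static final int RECORD_LEN = 768;   // assumed record length

  public static void main(String[] args) throws Exception {
    byte[] rec = new byte[RECORD_LEN];
    try (DataInputStream in = new DataInputStream(new FileInputStream(args[0]))) {
      in.readFully(rec);                        // read the first record
      String keyword  = field(rec, 0, 80);      // 1. keyword
      String filename = field(rec, 80, 175);    // 2. filename
      String width    = field(rec, 175, 177);   // 3. width
      String height   = field(rec, 177, 179);   // 4. height
      System.out.println(keyword + " " + filename + " " + width + "x" + height);
    }
  }

  // Extract the bytes between two offsets as a trimmed ASCII string.
  static String field(byte[] rec, int from, int to) {
    return new String(rec, from, to - from, StandardCharsets.US_ASCII).trim();
  }
}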
******************************************************************************
efficient online
"mapreduce.input.fileinputformat.split.minsize.per.node"
"mapreduce.input.fileinputformat.split.minsize.per.rack"
"mapreduce.input.fileinputformat.split.maxsize"
clusterID = ((NewTrackingRecordReader)reader).getRecordReader();
ssh -N -f -L 54444:compute-0-0:8080 [email protected]
hadoop jar hadoop-SubsetApprox-1.2.1.jar org.apache.hadoop.mapreduce.approx.app.GitHubEvent -s 100000 -w 0=PushEvent -t github -i /2015-01.json -o /github
hadoop jar hadoop-SubsetApprox-1.2.1.jar org.apache.hadoop.mapreduce.approx.index.IndexUserVisit -s 10000 -f 2-4-5 -t uservisit -i /test/uservist.test
hadoop jar hadoop-SubsetApprox-1.2.1.jar org.apache.hadoop.mapreduce.approx.index.IndexGitHub -s 10000 -f 1 -t githuborg -i /2015-01.json
hadoop jar hadoop-SubsetApprox-1.2.1.jar org.apache.hadoop.mapreduce.approx.app.GitHubEvent -d -a avg -s 100000 -w payload.issue.created_at=2015-01-0 -t github10k -i /2015-01.json -o /github/tmp
hadoop jar hadoop-SubsetApprox-1.2.1.jar org.apache.hadoop.mapreduce.approx.app.GitHubEvent -d -a total -r 0.2 -w payload.issue.created_at=2015-01-0,org=true -t github1k -i /2015-01.json -o /github/tmp
hadoop jar hadoop-SubsetApprox-1.2.1.jar org.apache.hadoop.mapreduce.approx.app.GitHubEvent -d -a total -s 200000 -w type=IssueCommentEvent,org=true -t github10k -i /2015-01.json -o /github/tmp
hadoop jar hadoop-SubsetApprox-1.2.1.jar org.apache.hadoop.mapreduce.approx.app.GitHubEvent -a total -p -i /2015-01.json -o /github/tmp
**************************amazon precise******
hadoop jar hadoop-SubsetApprox-1.2.1.jar org.apache.hadoop.mapreduce.approx.app.AmazonReview -a total -w salesRank=Music -p -i /sortybytime -o /amazon/tmp
***************************amazon approx*******************
size
hadoop jar hadoop-SubsetApprox-1.2.1.jar org.apache.hadoop.mapreduce.approx.app.AmazonReview -d -a total -s 500000 -w salesRank=Books -t amazon10k -i /sortybytime -o /amazon/tmp
size with block unit
hadoop jar hadoop-SubsetApprox-1.2.1.jar org.apache.hadoop.mapreduce.approx.app.AmazonReview -d -b -a avg -s 200000 -w salesRank=Books,reviewTime=2000 -t amazon -i /sortybytime -o /amazon/tmp
ratio
hadoop jar hadoop-SubsetApprox-1.2.1.jar org.apache.hadoop.mapreduce.approx.app.AmazonReview -d -a avg -r 0.1 -w salesRank=Books,reviewTime=2000 -t amazon -i /sortybytime -o /amazon/tmp
error
hadoop jar hadoop-SubsetApprox-1.2.1.jar org.apache.hadoop.mapreduce.approx.app.AmazonReview -a avg -e 0.01 -c 0.95 -w salesRank=Books,reviewTime=2000 -t amazon -i /sortybytime -o /amazon/tmp
hadoop jar hadoop-SubsetApprox-1.2.1.jar org.apache.hadoop.mapreduce.approx.index.IndexAmazon -s 10000 -f 0-2 -t amazon10k -i /sortybytime
hadoop jar hadoop-SubsetApprox-1.2.1.jar org.apache.hadoop.mapreduce.approx.app.AmazonReview -d -a total -q -r 100 -z -w salesRank=Music -t amazon10k -i /sortybytime -o /amazon/tmp
hadoop jar hadoop-SubsetApprox-1.2.1.jar org.apache.hadoop.mapreduce.approx.app.AmazonReview -a total -s 100 -w salesRank=Music -t amazon1k -i /sortybytime -o /amazon/tmp -m=
hadoop jar hadoop-SubsetApprox-1.2.1.jar org.apache.hadoop.mapreduce.approx.app.AmazonReview -e 0.03 -c 0.95 -a total -w salesRank=Music -t amazon1k -i /sortybytime -o /amazon/tmp
hadoop jar hadoop-SubsetApprox-1.2.1.jar org.apache.hadoop.mapreduce.approx.app.GitHubEvent -a total -r 1.0 -w payload.issue.created_at=2015-01-0 -t github10k -i /2015-01.json -o /github/tmp -m 33554432
hadoop jar hadoop-SubsetApprox-1.2.1.jar org.apache.hadoop.mapreduce.approx.app.AmazonReview -a total -r 1.0 -w salesRank=Music -t amazon10k -i /sortybytime -o /amazon/tmp -m 33554432
import json

# Print the top-level created_at of the next event in the file.
def printType1(file):
    a = json.loads(file.readline())
    print(a['created_at'])
    return a

# Print the event type; for IssueCommentEvent also print the issue's created_at.
def printType3(file):
    a = json.loads(file.readline())
    print(a['type'])
    if str(a['type']) == "IssueCommentEvent":
        print(a['payload']['issue']['created_at'])
    return a
push event:
  ['payload']['size']
  ['org']  true or false
issuecomments event:
  ['payload']['issue']['comments']
pullrequest:
  c['payload']['pull_request']['additions']
fork:
  ['payload']['forkee']['size']
  [][]['watchers'] watchers_count
issue event:
  ['payload']['issue']['comments']
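A sketch of pulling the fields above out of one event line in Java using Jackson (library choice is an assumption; the note's own snippets use Python's json module):
// Extracts a few of the event fields listed above from a JSON line.
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;

public class EventFields {
  public static void main(String[] args) throws Exception {
    ObjectMapper mapper = new ObjectMapper();
    String line = args.length > 0 ? args[0] : "{\"type\":\"PushEvent\",\"payload\":{\"size\":3}}";
    JsonNode event = mapper.readTree(line);
    String type = event.path("type").asText();
    if ("PushEvent".equals(type)) {
      System.out.println(event.path("payload").path("size").asInt());                    // push: payload.size
    } else if ("IssueCommentEvent".equals(type)) {
      System.out.println(event.path("payload").path("issue").path("comments").asInt());  // issue comments
    }
    System.out.println(event.has("org"));   // whether the event carries an org field
  }
}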
33GB, 11:46:43----11:49:55
precise 228s
15/09/30 21:15:09 INFO mapred.JobClient: Map output records=6841398
2015-09-30 21:11:21
approximate 20s
15/09/30 21:50:49 INFO input.FileInputFormat: Total input paths to process : 1
15/09/30 21:51:09 INFO mapred.JobClient: Map output records=154497
************************
small data
**********************
github
repository id (storage cost experiment)
issuecomments issue date
type
org
1000, 10000, 100000
amazon
reviewTime
categories: shoes, Music
payload.issue.created_at=2015-01-0,type=IssueCommentEvent
67108864
The segments list needs to be in random order.
pps degree --> variance
On small data: quite clustered.
2015-10-15 18:30:02,741 INFO Subset.Reducer:segments:78174
2015-10-15 18:30:02,764 INFO Subset.Reducer:total:4.4334908974294525E8
2015-10-15 18:30:02,771 INFO Subset.Reducer:var:8.504869476324487E12
2015-10-15 18:30:02,772 INFO Subset.Reducer:error:5715969.4348595105
2015-10-15 18:30:02,820 INFO Subset.Reducer:deff_p:0.9999999999999827
2015-10-15 18:30:02,820 INFO Subset.Reducer:totalSize:3603756
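The logged error is consistent with a 95% confidence half-width, error = 1.96 * sqrt(var): 1.96 * sqrt(8.5049e12) is about 5.716e6, matching the error line above. A quick check (the z-value for 95% confidence is assumed; var is copied from the log):
// Check that the logged error is the 95% confidence half-width of the total estimate.
double var = 8.504869476324487E12;   // Subset.Reducer:var from the log above
double z95 = 1.96;                   // normal quantile for 95% confidence (assumed)
double err = z95 * Math.sqrt(var);   // half-width of the confidence interval
System.out.println(err);             // ~5.716e6, matching Subset.Reducer:error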