-
Notifications
You must be signed in to change notification settings - Fork 27
/
Copy pathwebkit_crawl.js
executable file
·122 lines (90 loc) · 2.93 KB
/
webkit_crawl.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
"use strict";
var sys = require("system");
var page = require("webpage").create();
// Both console.error and console.log are redirected to stderr; the only
// stdout write in this file is the matched page markup in onLoadFinished.
// Each call joins its arguments with a single space and appends a newline.
function writeLineToStderr(parts) {
  var line = Array.prototype.join.call(parts, ' ');
  sys.stderr.write(line + '\n');
}
console.error = function () {
  writeLineToStderr(arguments);
};
console.log = function () {
  writeLineToStderr(arguments);
};
/**
 * Global error hook: prints the error message plus one line per trace frame
 * through console.error, then exits with status 1.
 *
 * @param {string} msg   Error message.
 * @param {Array}  trace Trace frames; each frame is read for `file` (or
 *                       `sourceURL` as a fallback), `line`, and optionally
 *                       `function`.
 */
phantom.onError = function(msg, trace) {
  // Render a single trace frame as " -> <file>: <line>[ (in function <name>)]".
  function describeFrame(frame) {
    var origin = frame.file || frame.sourceURL;
    var text = ' -> ' + origin + ': ' + frame.line;
    if (frame.function) {
      text += ' (in function ' + frame.function + ')';
    }
    return text;
  }

  var report = ['PHANTOM ERROR: ' + msg];
  if (trace && trace.length) {
    report.push('TRACE:');
    trace.forEach(function(frame) {
      report.push(describeFrame(frame));
    });
  }
  console.error(report.join('\n'));
  phantom.exit(1);
};
// ---- Command-line handling and watchdog timer ----
// Usage: <script> <url> <keyword> [timeout in seconds, default 120]
//
// Globals set here and read by the page handlers:
//   url           - the address to open
//   keyword       - text that must appear in the page markup
//   timeout       - watchdog delay in seconds (always a positive number)
//   url_no_schema - url with its leading "http://" or "https://" removed
//
// NOTE: phantom.exit() does not halt the script synchronously; statements
// after it keep running until control returns to the event loop. Every
// failure path below is therefore an explicit branch, so a usage error no
// longer falls through into URL parsing and timer setup.
var DEFAULT_TIMEOUT_SECONDS = 120;
var url = sys.args[1];
var keyword = sys.args[2];
var timeout = DEFAULT_TIMEOUT_SECONDS;
var url_no_schema = url;

if (sys.args.length < 3) {
  console.error("Usage: " + sys.args[0] + " <url> <keyword> [timeout = 120]");
  phantom.exit(-1);
} else {
  if (sys.args.length >= 4) {
    // Arguments arrive as strings. A non-numeric value would turn the timer
    // delay into NaN and fire the watchdog almost immediately, so validate
    // it and fall back to the default instead.
    var requestedTimeout = Number(sys.args[3]);
    if (isFinite(requestedTimeout) && requestedTimeout > 0) {
      timeout = requestedTimeout;
    } else {
      console.error("Invalid timeout `" + sys.args[3] + "', using default " + DEFAULT_TIMEOUT_SECONDS + "s");
    }
  }

  // exec() returns null when the URL has no http(s):// prefix; test for that
  // directly rather than catching the TypeError from indexing null.
  var schemeMatch = /https?\:\/\/(.+)/.exec(url);
  if (schemeMatch) {
    url_no_schema = schemeMatch[1];
    // Watchdog: give up with exit status -5 if no page has matched in time.
    setTimeout(function() {
      console.log("等待超时,退出(timeout = " + timeout + "s)");
      phantom.exit(-5);
    }, timeout * 1000);
  } else {
    console.error("无法从 URL `" + url + "' 中提取不带协议前缀的网址");
    phantom.exit(-2);
  }
}
// Lifecycle hooks that are deliberately left as empty handlers.
['onInitialized', 'onLoadStarted'].forEach(function(hookName) {
  page[hookName] = function() {
  };
});
/// Runs each time a page load finishes; the handler's arguments are logged as-is.
/// The body markup is written to stdout (and the script exits with status 0)
/// only when BOTH conditions hold:
///   1. window.location.href contains url_no_schema, and
///   2. the body markup contains the keyword.
/// Otherwise this load is skipped and the script keeps running, waiting for
/// another load or for the watchdog timer armed earlier in this file.
page.onLoadFinished = function() {
  console.log("页面载入完成: " + Array.prototype.join.call(arguments, ' '));
  console.log("开始解析页面");
  var snapshot = page.evaluate(function() {
    // document.body can be missing (e.g. a non-HTML document); report that
    // as null instead of throwing inside the page context.
    var body = document.body;
    return {
      href: window.location.href,
      innerHTML: body ? body.innerHTML : null
    };
  });
  // Without this guard a missing result or body raises a TypeError here,
  // which reaches phantom.onError and aborts the whole crawl with status 1
  // instead of merely skipping this load.
  if (!snapshot || typeof snapshot.innerHTML !== 'string') {
    console.log("无法读取页面内容(document.body 不存在或解析失败),跳过");
    return;
  }
  console.log("页面解析完成");
  /// 1. Does the current location contain url_no_schema?
  if (snapshot.href.indexOf(url_no_schema) < 0) {
    console.log("当前页面的 URL `" + snapshot.href + "' 不包含我们需要的网址, 跳过");
    return;
  }
  /// 2. Does the body markup contain the keyword?
  if (snapshot.innerHTML.indexOf(keyword) < 0) {
    console.log("当前页面的源码中没有关键字 `" + keyword + "',跳过");
    return;
  }
  console.log("在页面中发现了关键字 `" + keyword + "',返回当前页面的源码并退出");
  // stdout carries only the matched markup; all logging goes to stderr.
  sys.stdout.write(snapshot.innerHTML);
  page.close();
  phantom.exit(0);
};
// Log every URL change; all handler arguments are joined with spaces.
page.onUrlChanged = function() {
  var details = Array.prototype.join.call(arguments, ' ');
  console.log("URL 改变: " + details);
};
// Navigation and repaint notifications are deliberately ignored.
['onNavigationRequested', 'onRepaintRequested'].forEach(function(eventName) {
  page[eventName] = function() {
  };
});
// When the page is closing, end the whole script with a success status.
page.onClosing = function handlePageClosing() {
  var SUCCESS_STATUS = 0;
  phantom.exit(SUCCESS_STATUS);
};
// Page console output and dialogs (alert / confirm / prompt) get empty
// handlers. Each handler returns undefined; how the runtime interprets that
// for confirm/prompt is not visible in this file.
var silencedPageHooks = ['onConsoleMessage', 'onAlert', 'onConfirm', 'onPrompt'];
for (var hookIndex = 0; hookIndex < silencedPageHooks.length; hookIndex++) {
  page[silencedPageHooks[hookIndex]] = function() {
  };
}
// Start loading the target URL. The open callback is intentionally empty:
// all result handling lives in page.onLoadFinished.
var ignoreOpenResult = function() {
};
page.open(url, ignoreOpenResult);