convert_cached_copy_to_feed.py
#!/usr/bin/python2
#
# Copyright 2007 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This code is not supported by Google
#
"""This script helps to migrate data from one appliance to another.
You need to compile a list of source urls (e.g. via export urls from crawl
diagnostics or export all urls).
IMPORTANT NOTES:
- the script will only export the content, not any meta tags associated
- the script will only contain what the cached version contains
i.e. the truncated file if the original source was longer than 2.5 MB
- since the script uses the cached version, it assumes to have the default
stylsheet. In case you modified the chached version header in the
default stylesheet, you need to adjust the header scrapper
- the script will use one connection to the appliance at a time to
download the cached versions. This means that you will have less
serving bandwidth during the runtime of the script
- 1GB limit is not honored. You might have to manually split the output file.
TODO(mblume)
- add meta data via get the search results first with getfields
- parse xml and add these as metadata to the feed.
"""
import base64
import codecs
import getopt
import sys
import urllib
import urllib2
import zlib
import HTMLParser
import xml.etree.cElementTree as ElementTree
from xml.sax.saxutils import quoteattr
#
# constants for the script
# NOTE: you should modify the content of the <datasource> tag below
#
# the xml header for the content feed
feed_header_text = """<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE gsafeed PUBLIC "-//Google//DTD GSA Feeds//EN" "">
<gsafeed>
<header>\
<datasource>convert_cached_copy_to_feed</datasource>
<feedtype>full</feedtype>
</header>
<group>\n"""
# the xml footer for the content feed
feed_footer_text = ' </group>\n</gsafeed>'
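#
# For reference, a generated feed with a single record looks roughly like
# this (the URL, mimetype and payload below are illustrative placeholders):
#
#   <?xml version="1.0" encoding="UTF-8"?>
#   <!DOCTYPE gsafeed PUBLIC "-//Google//DTD GSA Feeds//EN" "">
#   <gsafeed>
#     <header>
#       <datasource>convert_cached_copy_to_feed</datasource>
#       <feedtype>full</feedtype>
#     </header>
#     <group>
#       <record url="http://www.example.com/doc.html" mimetype="text/html">
#         <content encoding="base64compressed">eJw...base64...</content>
#       </record>
#     </group>
#   </gsafeed>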
def DeflateAndBase64Encode(string_val):
  """zlib compress and base64 encode a string."""
  zlibbed_str = zlib.compress(string_val)
  return base64.b64encode(zlibbed_str)
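# To inspect an encoded payload while debugging, the inverse transformation
# is simply (sketch only, not used by the script itself):
#
#   zlib.decompress(base64.b64decode(encoded_content))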
def Usage():
  """Print script usage instructions on stdout."""
  print """A Python script to download the cached versions from a GSA
and generate a feed that can be submitted to another GSA.
Usage: %s ARGS
  --gsaname: hostname or IP address of the source GSA,
             e.g. search.company.com or 129.13.54.2
  --urlfile: path and filename of the file containing all URLs to download,
             one URL per line
  --output:  path and filename of the generated XML feed file
  --help:    print this message""" % sys.argv[0]
def main(argv):
  #
  # Parameters we are going to initialize via command line args
  #
  # File containing the URLs to download (as indexed by the source appliance).
  cached_url_file_name = None
  # Hostname of the GSA/Mini.
  appliance_hostname = None
  # Path of the output feed file.
  output_feed_file_name = None
  try:
    opts, args = getopt.getopt(argv[1:], '',
                               ['help', 'gsaname=', 'urlfile=', 'output='])
  except getopt.GetoptError:
    # print help information and exit:
    Usage()
    sys.exit(2)
  for opt, arg in opts:
    if opt == '--help':
      Usage()
      sys.exit()
    if opt == '--gsaname':
      appliance_hostname = arg
    if opt == '--urlfile':
      cached_url_file_name = arg
    if opt == '--output':
      output_feed_file_name = arg
  if appliance_hostname and cached_url_file_name and output_feed_file_name:
    #
    # Real work begins here.
    #
    try:
      parser = HTMLParser.HTMLParser()
      output_file = open(output_feed_file_name, 'w')
      output_file.write(feed_header_text)
      # read all cached urls, one per line:
      cached_url_file = open(cached_url_file_name, 'r')
      for url in cached_url_file:
        cached_url = 'http://' + appliance_hostname + '/search?q=cache:'
        cached_url += urllib.quote_plus(url.rstrip())
        print 'Accessing URL - %s' % cached_url
        # since 7.0 (and possibly earlier), the response is an XML document
        # containing the cached content
        content = urllib2.urlopen(cached_url).read()
        gsp = ElementTree.fromstring(content)
        content_type = gsp.findall('.//CACHE_CONTENT_TYPE')[0].text
        blob = gsp.findall('.//BLOB')[0]
        encoding = blob.get('encoding')
        cache_response = codecs.decode(
            base64.b64decode(blob.text), encoding)
        if content_type == 'text/plain':
          # the blob that comes back is wrapped in HTML; unwrap it
          pre_body = cache_response[cache_response.find('<pre>') + len('<pre>'):
                                    cache_response.rfind('</pre>')]
          cached_content = parser.unescape(pre_body)
        else:
          cached_content = cache_response
        compressed_cached_content = DeflateAndBase64Encode(
            codecs.encode(cached_content, 'utf-8'))
        # debug output ------------------------------------
        #print 'complete content from GSA is:\n%s' % cached_content
        #print 'cached content is:\n%s' % compressed_cached_content
        # end debug output --------------------------------
        output_file.write(""" <record url=%s mimetype=%s>
  <content encoding="base64compressed">%s</content>
 </record>\n""" % (quoteattr(url.rstrip()), quoteattr(content_type),
                   compressed_cached_content))
    except Exception, exception:
      print 'Got exception: %s' % exception
      sys.exit(1)
    finally:
      output_file.write(feed_footer_text)
  else:
    Usage()
    sys.exit(1)


if __name__ == '__main__':
  main(sys.argv)
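# The generated feed can then be pushed to the target appliance. A minimal
# sketch of such a push is below (commented out, not part of this script).
# It assumes the target GSA accepts feeds on its feedergate port 19900 at
# /xmlfeed with the fields feedtype, datasource and data; the hostname and
# file name are placeholders, and the official feeds documentation uses a
# multipart form POST, so adapt as needed:
#
#   feed_data = open('feed.xml', 'r').read()
#   params = urllib.urlencode({'feedtype': 'full',
#                              'datasource': 'convert_cached_copy_to_feed',
#                              'data': feed_data})
#   urllib2.urlopen('http://target-gsa.example.com:19900/xmlfeed', params)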