-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape_translate.py
201 lines (144 loc) · 5.87 KB
/
scrape_translate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
#! /usr/bin/env python
import urllib2
import urllib
import json
from bs4 import BeautifulSoup
# Maps a lower-case language name to the language code sent to Google
# Translate. Keys must be entirely lower-case: fromHtml() and fromAjax()
# look names up with ``name.lower()``, so a key containing a capital letter
# could never be matched.
langCode = {
    "afrikaans": "af",
    "albanian": "sq",
    "arabic": "ar",
    "azerbaijani": "az",
    "basque": "eu",
    "bengali": "bn",
    "belarusian": "be",
    "bulgarian": "bg",
    "catalan": "ca",
    "chinese simplified": "zh-CN",
    "chinese traditional": "zh-TW",
    "croatian": "hr",
    "czech": "cs",
    "danish": "da",
    "dutch": "nl",
    "english": "en",
    "esperanto": "eo",
    "estonian": "et",
    "filipino": "tl",
    "finnish": "fi",
    "french": "fr",
    "galician": "gl",
    "georgian": "ka",
    "german": "de",
    "greek": "el",
    "gujarati": "gu",
    "haitian creole": "ht",
    "hebrew": "iw",
    "hindi": "hi",
    "hungarian": "hu",
    "icelandic": "is",
    "indonesian": "id",
    "irish": "ga",
    "italian": "it",
    "japanese": "ja",
    "kannada": "kn",
    "korean": "ko",
    "latin": "la",
    "latvian": "lv",
    "lithuanian": "lt",
    "macedonian": "mk",
    "malay": "ms",
    "maltese": "mt",
    "norwegian": "no",
    "persian": "fa",
    "polish": "pl",
    "portuguese": "pt",
    "romanian": "ro",
    "russian": "ru",
    "serbian": "sr",
    "slovak": "sk",
    "slovenian": "sl",
    "spanish": "es",
    "swahili": "sw",
    "swedish": "sv",
    "tamil": "ta",
    "telugu": "te",
    "thai": "th",
    "turkish": "tr",
    "ukrainian": "uk",
    "urdu": "ur",
    "vietnamese": "vi",
    "welsh": "cy",
    "yiddish": "yi",
}
def fromHtml(text, languageFrom, languageTo):
    """
    Return translated text scraped from Google Translate's HTML source code.

    text         -- the text to translate.
    languageFrom -- name of the source language; looked up in langCode
                    after lower-casing.
    languageTo   -- name of the target language; looked up the same way.

    Returns the rendered contents of the matching <span>, or None when
    either language is not in langCode (a message is printed) or when the
    returned page contains no span whose title equals ``text``.
    """
    #Set the user agent.
    urllib.FancyURLopener.version = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008070400 SUSE/3.0.1-0.1 Firefox/3.0.1"
    #Resolve the language codes; only these lookups can raise KeyError, so
    #they are the only statements kept inside the try.
    try:
        sourceCode = langCode[languageFrom.lower()]
        targetCode = langCode[languageTo.lower()]
    except KeyError as error:
        print("Currently we do not support %s" % (error.args[0]))
        return None
    #Encode the parameters we're going to send to the Google servers.
    postParameters = urllib.urlencode({"langpair": "%s|%s" % (sourceCode, targetCode),
                                       "text": text, "ie": "UTF8", "oe": "UTF8"})
    #Send the request with the above parameters.
    page = urllib.urlopen("http://translate.google.com/translate_t", postParameters)
    try:
        #Read exactly once, before closing: reading after close() would
        #replace the page source with nothing.
        content = page.read()
    finally:
        #Don't forget to close the connection, even if the read fails.
        page.close()
    htmlSource = BeautifulSoup(content)
    #Google creates a span with title the same as the text you wanted to translate.
    #So let's find a 'span' that has as a Title the 'text' we passed to this method.
    translation = htmlSource.find('span', title=text)
    if translation is None:
        #No matching span in the page: report "no translation" instead of
        #failing with AttributeError on None.
        return None
    #the renderContents() method returns the body that is inside of the span we found.
    return translation.renderContents()
def fromAjax(text, languageFrom, languageTo):
    """
    Return the text translated from "languageFrom" to "languageTo" using
    the Google Translate AJAX service.

    text         -- the text to translate (byte string or unicode).
    languageFrom -- source language name; if its lower-cased form is a key
                    of langCode the mapped code is used, otherwise the
                    value is passed through unchanged.
    languageTo   -- target language, resolved the same way.

    Returns the 'translatedText' entry of the response's 'responseData',
    or None when 'responseData' cannot be indexed (for example when it is
    null in the decoded JSON).
    """
    base_url = 'http://ajax.googleapis.com/ajax/services/language/translate?'
    langpair = '%s|%s' % (langCode.get(languageFrom.lower(), languageFrom),
                          langCode.get(languageTo.lower(), languageTo))
    # Only encode text that is not already a plain str. Calling .encode()
    # on a non-ASCII byte string raises UnicodeDecodeError under Python 2,
    # which previously left the query parameters undefined.
    query = text
    if not isinstance(query, str):
        query = query.encode('utf-8')
    params = urllib.urlencode((('v', 1.0),
                               ('q', query),
                               ('langpair', langpair),))
    response = urllib2.urlopen(base_url + params)
    try:
        content = response.read()
    finally:
        # Release the connection whether or not the read succeeded.
        response.close()
    trans_dict = json.loads(content)
    try:
        return trans_dict['responseData']['translatedText']
    except TypeError:
        # 'responseData' is not subscriptable (e.g. None): no translation.
        return None
############################################################################################
# Main treat:
# For each word list, translate every English word into every language in
# langCode and append each translation, one per line, to that same file.
files = ["animals.txt", "adjectives.txt"]
for File in files:
    # Read the whole list first; the file is re-opened for appending below.
    # Strip line endings and skip blank lines so the service is queried
    # with the bare word only.
    with open(File, 'r') as infile:
        words = [line.strip() for line in infile if line.strip()]
    with open(File, 'a') as outfile:
        for language in langCode:
            for word in words:
                transWord = fromAjax(word, "english", language)
                # Test for a missing translation before any conversion:
                # str(None) would yield the text 'None'.
                if transWord is None:
                    print("Did not write!" + '\n')
                    continue
                # A unicode result must be encoded explicitly; str() on
                # non-ASCII unicode raises UnicodeEncodeError on Python 2.
                if not isinstance(transWord, str):
                    transWord = transWord.encode('utf-8')
                print(word + ' is ' + transWord + ' in ' + language)
                outfile.write(transWord + '\n')
                print('\n')