-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrapper.py
158 lines (157 loc) · 13.4 KB
/
scrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import requests
from bs4 import BeautifulSoup
import re
import sys
# Patterns are compiled once and written as raw strings; the previous non-raw
# "\[" / "\(" literals are invalid escape sequences that warn on current Python.
_BRACKETED = re.compile(r"\[.*?\]")
_PARENTHESISED = re.compile(r"\(.*?\)")
_EN_DASH = '–'

# Seconds to wait for the HTTP response before giving up instead of hanging.
_REQUEST_TIMEOUT = 30

# Hard-coded positions (index into the first table's <tr> list) of rows whose
# cells need fixed-width slicing instead of whitespace splitting.
# NOTE(review): the names suggest specific engines; these indexes silently
# point at other rows if the table is reordered -- confirm against the page.
_LE7A_ROW = 20
_NK33A_ROW = 27
_HELIX_ROW = 17

# Positional labels whose cell is followed by an empty-string padding value
# when the cell yields a single value instead of a pair.
_FIRST_TABLE_PAIR_LABELS = ('Specific impulse Vac (s)', 'Specific impulse SL (s)')
_SECOND_TABLE_PAIR_LABELS = ('Power cycle', 'Specific impulse Vac (s)')


def _drop_bracketed(text):
    """Return *text* without ``[...]`` spans (and without any literal ``[]``)."""
    return _BRACKETED.sub("[]", text).replace("[]", "")


def _drop_parenthesised(text):
    """Return *text* without ``(...)`` spans (and without any literal ``()``)."""
    return _PARENTHESISED.sub("()", text).replace("()", "")


def _header_titles(table):
    """Return the column titles of *table* as a tuple of strings.

    The 'Specific impulse (s)' and 'Thrust (N)' headers are each expanded
    into a 'Vac' and an 'SL' title; every other header is kept as cleaned.
    """
    titles = []
    for header in table.find_all('th'):
        # NOTE(review): replace(" ", " ") is a no-op as written; the first
        # argument may originally have been a non-ASCII space -- confirm.
        raw = header.get_text().replace("\n", "").replace("\u200b", "").replace(" ", " ")
        title = _drop_bracketed(raw)
        if title == 'Specific impulse (s)':
            titles += ['Specific impulse Vac (s)', 'Specific impulse SL (s)']
        elif title == 'Thrust (N)':
            titles += ['Thrust Vac (N)', 'Thrust SL (N)']
        else:
            titles.append(title)
    return tuple(titles)


def _cell_text(cell, free_text):
    """Return the cleaned text of one ``<td>`` cell.

    When *free_text* is true commas are kept and newlines become spaces;
    otherwise commas are removed so that numbers can be parsed later.
    """
    # NOTE(review): replace(" ", " ") is a no-op as written; see _header_titles.
    raw = (cell.get_text().replace("\u200b", "").replace("\u2009", "")
           .replace("\xa0", "").replace(" ", " "))
    text = _drop_bracketed(raw)
    if not free_text:
        text = text.replace(",", "")
    for noise in ("est.", "~", ">", "<"):
        text = text.replace(noise, "")
    if free_text:
        text = text.replace("\n", " ")
    text = text.removesuffix("\n")
    # Cells ending in one of these two designations keep only their first six
    # characters; cells containing any of the markers below keep only their
    # first whitespace-separated word.
    if text.endswith('11Д521') or text.endswith('8Д420'):
        text = text[0:6]
    if any(marker in text for marker in ('Д', '8D6', '11D2', '11D5', '15D1', '15D3')):
        text = text.split()[0]
    return text


def _midpoint(interval, dash=_EN_DASH):
    """Return the mean of the first two *dash*-separated numbers in *interval*."""
    bounds = interval.split(dash)
    return (float(bounds[0]) + float(bounds[1])) / 2


def _value_pair(text, parts):
    """Return two floats from ``parts[0]`` and ``parts[1]``.

    If *text* contains an en dash, each part is treated as an interval and
    replaced by its midpoint.
    """
    if _EN_DASH in text:
        return (_midpoint(parts[0]), _midpoint(parts[1]))
    return (float(parts[0]), float(parts[1]))


def _looks_numeric(text, dashes):
    """Tell whether *text* is digits once '(SL)', *dashes* and dots are removed."""
    probe = text.replace("(SL)", "").strip()
    for dash in dashes:
        probe = probe.replace(dash, "")
    return probe.replace(".", "").isnumeric()


def _single_value(text, dashes):
    """Return *text* as one float, using the midpoint if it is an interval."""
    stripped = text.replace("(SL)", "").replace("with fuel", "")
    for dash in dashes:
        if dash in text:
            return _midpoint(stripped, dash)
    return float(stripped)


def _first_table_rows(table, titles):
    """Convert the ``<tr>`` rows of the first table into a list of tuples.

    ``titles[j]`` is used as a positional label for the j-th cell. *titles*
    holds two entries for each split header while a row has one cell for it,
    so from the specific-impulse cell onwards the label is the one belonging
    to the previous column.
    NOTE(review): the 'Specific impulse SL (s)' test therefore appears to
    select the thrust cell -- confirm against the live table.
    """
    table_rows = table.find_all('tr')
    total = len(table_rows)
    converted = []
    for i, table_row in enumerate(table_rows):
        row = ()
        for j, cell in enumerate(table_row.find_all('td')):
            label = titles[j]
            text = _cell_text(cell, free_text=label in ('Origin', 'Vehicle', 'Use'))
            bare = _drop_parenthesised(text)
            # NOTE(review): no-op replace kept from the original; see _header_titles.
            spaced = bare.replace(" ", " ")
            parts = spaced.split()
            if label == 'Specific impulse Vac (s)' and (
                    len(parts) > 1 or i in (_HELIX_ROW, _NK33A_ROW)):
                if _EN_DASH in text:
                    row += (_midpoint(parts[0]), _midpoint(parts[1]))
                elif i == _HELIX_ROW:
                    row += (float(bare[0:3]), float(bare[2:6]))
                elif i == _NK33A_ROW:
                    row += (float(spaced[0:3]), float(spaced[3:6]))
                else:
                    row += (float(parts[0]), float(parts[1]))
            elif (label == 'Specific impulse Vac (s)'
                  and text[text.find("(") + 1:text.find(")")] == 'Vac'):
                # Only a value tagged "(Vac)" is present; pad the second slot.
                row += (float(text.split(" ")[0]), '')
            elif label == 'Specific impulse SL (s)' and (len(parts) > 1 or i == _LE7A_ROW):
                if _EN_DASH in text:
                    row += (_midpoint(parts[0]), _midpoint(parts[1]))
                elif i == _LE7A_ROW:
                    row += (float(spaced[0:7]), float(spaced[7:13]))
                else:
                    row += (float(parts[0]), float(parts[1]))
            elif _looks_numeric(text, (_EN_DASH,)) and label != 'Thrust SL (N)':
                row += (_single_value(text, (_EN_DASH,)),)
                if label in _FIRST_TABLE_PAIR_LABELS:
                    row += ("",)
            else:
                # Anything that is not recognised as a number stays a string.
                row += (text,)
                if label in _FIRST_TABLE_PAIR_LABELS:
                    row += ("",)
        print(f"{int((i / total) * 50)}% done")
        # Rows without <td> cells (such as a header row) produce nothing.
        if row != ():
            converted.append(row)
    return converted


def _second_table_rows(table, titles):
    """Convert the ``<tr>`` rows of the second table into a list of tuples.

    Cells are labelled positionally with *titles*, which is the header of the
    FIRST table; the pair-splitting tests use 'Power cycle' and
    'Specific impulse Vac (s)' as labels here. Every non-empty row gets the
    string "Retired" inserted at index 4.
    NOTE(review): this looks like compensation for a column that the second
    table lacks -- confirm against the live table.
    """
    table_rows = table.find_all('tr')
    total = len(table_rows)
    dashes = (_EN_DASH, '-')
    converted = []
    for i, table_row in enumerate(table_rows):
        row = ()
        for j, cell in enumerate(table_row.find_all('td')):
            label = titles[j]
            text = _cell_text(cell, free_text=label in ('Origin', 'Vehicle', 'Status'))
            # NOTE(review): no-op replace kept from the original; see _header_titles.
            parts = _drop_parenthesised(text).replace(" ", " ").split()
            if label in _SECOND_TABLE_PAIR_LABELS and len(parts) > 1:
                row += _value_pair(text, parts)
            elif _looks_numeric(text, dashes) and label != 'Thrust Vac (N)':
                row += (_single_value(text, dashes),)
                if label in _SECOND_TABLE_PAIR_LABELS:
                    row += ("",)
            else:
                # Anything that is not recognised as a number stays a string.
                row += (text,)
                if label in _SECOND_TABLE_PAIR_LABELS:
                    row += ("",)
        print(f"{int((i / total) * 50 + 50)}% done")
        if row != ():
            # The row whose first value is 'P230' gets one extra empty
            # trailing value before "Retired" is inserted.
            if row[0] == 'P230':
                row += ('',)
            fields = list(row)
            fields.insert(4, "Retired")
            converted.append(tuple(fields))
    return converted


def scrapper():
    """Scrape the first two tables of the rocket-engine comparison page.

    Downloads the Wikipedia page, reads the header of the first table and
    converts the rows of the first and second tables. Progress is printed
    to stdout while the rows are processed.

    Returns:
        A ``(titles, data)`` pair: ``titles`` is a tuple of column-title
        strings taken from the first table, and ``data`` is a list with one
        tuple per non-empty table row, holding floats where a cell was
        recognised as numeric and strings otherwise.

    Raises:
        requests.RequestException: if the download fails, times out or the
            server answers with an HTTP error status.
        IndexError: if the page holds fewer than two tables or a row has
            more cells than there are titles.
        ValueError: if a cell selected for numeric conversion is not a number.
    """
    print("Scrapping engines data...")
    link = 'https://en.wikipedia.org/wiki/Comparison_of_orbital_rocket_engines'
    # A timeout keeps a stalled connection from blocking forever, and the
    # status check stops an error page from being parsed as if it were data.
    response = requests.get(link, timeout=_REQUEST_TIMEOUT)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    tables = soup.find_all('table')
    titles = _header_titles(tables[0])
    data = _first_table_rows(tables[0], titles)
    data += _second_table_rows(tables[1], titles)
    # The function returns the titles and the data
    print("100% done")
    return titles, data
# Replace this module's entry in sys.modules with the scrapper function, so
# that what an importer receives for this module name is the callable itself
# rather than a module object.
sys.modules[__name__] = scrapper