-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathScraper.py
82 lines (66 loc) · 3.55 KB
/
Scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import json
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
class Scraper:
    """Scrape listing pages with headless Chrome and track them in a JSON file.

    Listings are represented as plain dicts and collected in a mapping keyed
    by the listing URL. The ``skip_*`` attributes control which listings
    ``Scrape`` leaves out; all of them default to ``True``.

    The instance owns a Chrome process. Call ``Close()`` when done, or use the
    scraper as a context manager (``with Scraper(path) as scraper: ...``).
    """

    # CSS class carried by every listing row on a results page.
    LISTING_CLASS = "mp-Listing--list-item"
    # Extra CSS class that the skip_ads filter looks for on a listing row.
    AD_CLASS = "mp-Listing--cas"
    # Price-label texts checked by the skip_reserved / skip_bidding filters.
    RESERVED_LABEL = "Gereserveerd"
    BIDDING_LABEL = "Bieden"

    def __init__(self, webdriverpath: str):
        """Start a headless Chrome session.

        Args:
            webdriverpath: Path to the chromedriver executable.
        """
        options = webdriver.ChromeOptions()
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        options.headless = True
        # NOTE(review): `chrome_options=` is the Selenium 3 spelling, kept to
        # match the `find_element_by_*` calls used throughout this class.
        self.driver = webdriver.Chrome(webdriverpath, chrome_options=options)
        self.driver.set_window_size(1120, 550)
        self.skip_ads = True
        self.skip_commercial_sellers = True
        self.skip_bidding = True
        self.skip_reserved = True

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.Close()
        return False  # never swallow exceptions raised inside the with-block

    def Close(self):
        """Quit the browser so the Chrome process does not outlive the scraper."""
        self.driver.quit()

    def Scrape(self, url: str) -> tuple:
        """Load ``url`` and extract the listings shown on the page.

        Args:
            url: Address of the results page to load.

        Returns:
            A ``(listings, skipped)`` tuple. ``listings`` maps each kept
            listing's URL to a dict with the keys ``title``, ``description``,
            ``price``, ``url``, ``date``, ``seller_name`` and
            ``seller_website`` (``False`` when the listing has no seller
            link). ``skipped`` is the number of listing elements found on the
            page minus the number of listings kept.
        """
        driver = self.driver
        driver.get(url)

        # Query the DOM once so the count and the iteration see the same
        # elements.
        elements = driver.find_elements_by_class_name(self.LISTING_CLASS)
        original_count = len(elements)

        listings = {}
        for element in elements:
            if self.skip_ads and self.AD_CLASS in element.get_attribute('class'):
                continue
            listing = self._extract_listing(element)
            if self._is_filtered_out(listing):
                continue
            listings[listing['url']] = listing
        return listings, original_count - len(listings)

    @staticmethod
    def _extract_listing(element) -> dict:
        """Read the fields of one listing element into a dict."""
        listing = {}
        listing['title'] = element.find_element_by_css_selector('h3.mp-Listing-title').text
        listing['description'] = element.find_element_by_css_selector('p.mp-Listing-description').text
        listing_url = element.find_element_by_class_name("mp-Listing-coverLink").get_attribute("href")
        listing['price'] = element.find_element_by_class_name('mp-text-price-label').text
        listing['url'] = listing_url
        listing['date'] = element.find_element_by_class_name("mp-Listing-date").text
        listing['seller_name'] = element.find_element_by_class_name("mp-Listing-seller-name").text
        try:
            listing['seller_website'] = element.find_element_by_class_name(
                "mp-Listing-sellerCoverLink").get_attribute('href')
        except NoSuchElementException:
            # No seller link on this listing; False is the stored marker.
            listing['seller_website'] = False
        return listing

    def _is_filtered_out(self, listing: dict) -> bool:
        """Return True when an enabled skip_* filter rejects ``listing``."""
        if self.skip_commercial_sellers and listing['seller_website'] is not False:
            return True
        if self.skip_reserved and self.RESERVED_LABEL in listing['price']:
            return True
        if self.skip_bidding and self.BIDDING_LABEL in listing['price']:
            return True
        return False

    @staticmethod
    def _load_saved_listings(filename: str) -> dict:
        """Return the listings stored in ``filename``.

        A missing or empty file counts as "nothing saved yet" and yields an
        empty dict. Malformed JSON still raises, so a corrupted store is never
        silently overwritten.
        """
        try:
            with open(filename, "r") as json_file:
                raw = json_file.read()
        except FileNotFoundError:
            return {}
        if not raw.strip():
            return {}
        return json.loads(raw)

    def SaveListings(self, listings: dict, filename: str) -> None:
        """Merge ``listings`` into the JSON file at ``filename``.

        Entries already in the file are kept; entries with the same key are
        replaced by the ones in ``listings``. The file is created when it does
        not exist yet.

        Args:
            listings: Mapping of listing URL to listing dict.
            filename: Path of the JSON store.
        """
        merged = self._load_saved_listings(filename)
        merged.update(listings)
        # Mode "w" truncates the file, so no separate clearing step is needed.
        with open(filename, "w") as json_file:
            json.dump(merged, json_file)

    def CompareListingsToSavedListings(self, listings: dict, filename: str) -> dict:
        """Return the entries of ``listings`` that are not stored yet.

        Args:
            listings: Mapping of listing URL to listing dict.
            filename: Path of the JSON store to compare against.

        Returns:
            A new dict with the entries of ``listings`` whose key does not
            appear in the store. When the store is missing or empty, every
            listing is new.
        """
        saved_listings = self._load_saved_listings(filename)
        return {
            key: details
            for key, details in listings.items()
            if key not in saved_listings
        }