Commit

wip - monster

cullenwatson committed Jul 22, 2024
1 parent 8570c06 commit ce831c8
Showing 4 changed files with 200 additions and 2 deletions.
2 changes: 2 additions & 0 deletions src/jobspy/__init__.py
@@ -10,6 +10,7 @@
from .scrapers.ziprecruiter import ZipRecruiterScraper
from .scrapers.glassdoor import GlassdoorScraper
from .scrapers.linkedin import LinkedInScraper
+from .scrapers.monster import MonsterScraper
from .scrapers import SalarySource, ScraperInput, Site, JobResponse, Country
from .scrapers.exceptions import (
    LinkedInException,
@@ -49,6 +50,7 @@ def scrape_jobs(
        Site.INDEED: IndeedScraper,
        Site.ZIP_RECRUITER: ZipRecruiterScraper,
        Site.GLASSDOOR: GlassdoorScraper,
+        Site.MONSTER: MonsterScraper,
    }
    set_logger_level(verbose)

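With MonsterScraper registered in the site map above, the new board becomes reachable through scrape_jobs. A minimal usage sketch follows; the keyword names are assumptions inferred from the ScraperInput fields this commit uses (search_term, location, distance, results_wanted), not something the diff itself confirms:

from jobspy import scrape_jobs

# Hypothetical call: "monster" matches Site.MONSTER's enum value, so the
# scraper map above would dispatch to MonsterScraper. Keyword names are
# assumed from ScraperInput; the values are illustrative.
jobs = scrape_jobs(
    site_name="monster",
    search_term="software engineer",
    location="Dallas, TX",
    distance=50,  # miles, matching the "mi" radius unit in the Monster payload
    results_wanted=20,
)
print(jobs)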
3 changes: 3 additions & 0 deletions src/jobspy/scrapers/__init__.py
@@ -17,11 +17,14 @@ class Site(Enum):
    INDEED = "indeed"
    ZIP_RECRUITER = "zip_recruiter"
    GLASSDOOR = "glassdoor"
+    MONSTER = "monster"


class SalarySource(Enum):
    DIRECT_DATA = "direct_data"
    DESCRIPTION = "description"


class ScraperInput(BaseModel):
    site_type: list[Site]
    search_term: str | None = None
193 changes: 193 additions & 0 deletions src/jobspy/scrapers/monster/__init__.py
@@ -0,0 +1,193 @@
"""
jobspy.scrapers.monster
~~~~~~~~~~~~~~~~~~~
This module contains routines to scrape Monster Jobs.
"""

from __future__ import annotations

import json
import math
import uuid

from concurrent.futures import ThreadPoolExecutor

from dateutil.parser import parse

from .. import Scraper, ScraperInput, Site
from ..utils import (
    logger,
    extract_emails_from_text,
    create_session,
    markdown_converter,
)
from ...jobs import (
    JobPost,
    Location,
    JobResponse,
    DescriptionFormat,
)


class MonsterScraper(Scraper):
    base_url = "https://www.monster.com/job-openings/"
    api_url = "https://appsapi.monster.io/profiles-native-apps-app-service/v3/jobs/search?languageTag=en-US&apikey=fLGr7wcNEfMSzTdWygKnhtyNAB7QzXOq"

    def __init__(self, proxies: list[str] | str | None = None):
        """
        Initializes MonsterScraper
        """
        super().__init__(Site.MONSTER, proxies=proxies)

        self.scraper_input = None
        self.session = create_session(proxies=proxies)
        # self.search_id = "0979dd0c-9886-45ac-b7e3-9395f74f775"
        # self.fingerprint_id = "7144F133-D147-41EB-ADFF-67B44D61BEEF"
        self.search_id = str(uuid.uuid4())
        self.fingerprint_id = str(uuid.uuid4()).upper()

        self.jobs_per_page = 50
        self.seen_urls = set()

    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        """
        Scrapes Monster for jobs with scraper_input criteria.
        :param scraper_input: Information about job search criteria.
        :return: JobResponse containing a list of jobs.
        """
        self.scraper_input = scraper_input
        job_list: list[JobPost] = []

        max_pages = math.ceil(scraper_input.results_wanted / self.jobs_per_page)
        for page in range(1, min(11, max_pages + 1)):
            if len(job_list) >= scraper_input.results_wanted:
                break
            logger.info(f"Monster search page: {page}")
            jobs_on_page = self._find_jobs_in_page(scraper_input, page)
            if jobs_on_page:
                job_list.extend(jobs_on_page)
            else:
                break
        return JobResponse(jobs=job_list[: scraper_input.results_wanted])

    def _find_jobs_in_page(
        self, scraper_input: ScraperInput, page: int
    ) -> list[JobPost]:
        """
        Scrapes a page of Monster for jobs with scraper_input criteria
        :param scraper_input:
        :param page:
        :return: jobs found on page
        """
        jobs_list = []
        payload = self._add_payload(scraper_input, (page - 1) * self.jobs_per_page)
        try:
            # payload is already JSON-encoded, so send it as the raw request body
            res = self.session.post(self.api_url, headers=self.headers, data=payload)
            if res.status_code not in range(200, 400):
                if res.status_code == 429:
                    err = "429 Response - Blocked by Monster for too many requests"
                else:
                    err = f"Monster response status code {res.status_code} with response: {res.text}"
                logger.error(err)
                return jobs_list
        except Exception as e:
            if "Proxy responded with" in str(e):
                logger.error("Monster: Bad proxy")
            else:
                logger.error(f"Monster: {str(e)}")
            return jobs_list

        res_data = res.json()
        raw_jobs_list = res_data.get("jobResults", [])
        with ThreadPoolExecutor(max_workers=self.jobs_per_page) as executor:
            job_results = [
                executor.submit(self._process_job, job) for job in raw_jobs_list
            ]

        jobs_list = list(filter(None, (result.result() for result in job_results)))
        return jobs_list

    def _process_job(self, job: dict) -> JobPost | None:
        """
        Processes an individual job dict from the response
        """
        job_posting = job["jobPosting"]
        title = job_posting.get("title")
        job_url = f"{self.base_url}{job['jobId']}"
        if job_url in self.seen_urls:
            return
        self.seen_urls.add(job_url)
        # keep the apply URL only when it points off monster.com (a direct employer link)
        job_url_direct = (
            job["apply"].get("applyUrl")
            if job.get("apply")
            and "monster.com" not in job["apply"].get("applyUrl", "")
            else None
        )

        description = job_posting.get("description", "")
        description = (
            markdown_converter(description)
            if self.scraper_input.description_format == DescriptionFormat.MARKDOWN
            else description
        )
        company = job_posting.get("hiringOrganization", {}).get("name")

        location_dict = (
            job_posting["jobLocation"][0].get("address", {})
            if job_posting.get("jobLocation")
            else {}
        )
        location = Location(
            city=location_dict.get("addressLocality"),
            state=location_dict.get("addressRegion"),
            country=location_dict.get("addressCountry"),
        )
        date_posted = parse(job_posting["datePosted"]).date()

        return JobPost(
            id=job["jobId"],
            title=title,
            company_name=company,
            location=location,
            date_posted=date_posted,
            job_url=job_url,
            description=description,
            emails=extract_emails_from_text(description) if description else None,
            job_url_direct=job_url_direct,
        )

    def _add_payload(self, scraper_input, offset) -> str:
        payload = {
            "jobAdsRequest": {
                "position": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                "placement": {
                    "property": "MobileApp",
                    "view": "CARD",
                    "type": "JOB_SEARCH",
                    "location": "JobSearchPage",
                    "channel": "MOBILE",
                },
            },
            "searchId": self.search_id,
            "offset": offset,
            "pageSize": self.jobs_per_page,
            "fingerprintId": self.fingerprint_id,
            "jobQuery": {
                "query": scraper_input.search_term,
                "locations": [
                    {
                        "address": scraper_input.location,
                        "country": "US",
                        "radius": {"value": scraper_input.distance, "unit": "mi"},
                    }
                ],
            },
        }
        return json.dumps({k: v for k, v in payload.items() if v is not None})

    headers = {
        "Host": "appsapi.monster.io",
        "accept": "*/*",
        "content-type": "application/json",
        "user-agent": "Jobr/17.0.0 (com.jobrapp.ios; build:17000.14; iOS 17.5.1) Alamofire/5.8.0",
        "accept-language": "en-US;q=1.0",
    }
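
The module above is self-contained enough to drive directly, bypassing scrape_jobs. A sketch, with ScraperInput field names taken from their uses in this diff and all values illustrative:

from jobspy.scrapers import ScraperInput, Site
from jobspy.scrapers.monster import MonsterScraper

# Hypothetical direct invocation; distance/results_wanted defaults are assumed.
scraper = MonsterScraper()
response = scraper.scrape(
    ScraperInput(
        site_type=[Site.MONSTER],
        search_term="data engineer",
        location="Austin, TX",
        distance=25,
        results_wanted=10,
    )
)
for job in response.jobs:
    print(job.title, job.company_name, job.job_url)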
4 changes: 2 additions & 2 deletions src/jobspy/scrapers/ziprecruiter/__init__.py
@@ -110,9 +110,9 @@ def _find_jobs_in_page(
                return jobs_list, ""
        except Exception as e:
            if "Proxy responded with" in str(e):
-                logger.error(f"Indeed: Bad proxy")
+                logger.error(f"ZipRecruiter: Bad proxy")
            else:
-                logger.error(f"Indeed: {str(e)}")
+                logger.error(f"ZipRecruiter: {str(e)}")
            return jobs_list, ""

        res_data = res.json()