
Commit

merged dockerfiles, updated requirements and pinned them, updated to Python 3.11, fixed the random and byname scrapers, deprecated docker-compose in favor of docker compose
eracle committed Oct 4, 2024
1 parent 1365ddc commit ae8b6fd
Showing 18 changed files with 144 additions and 104 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -6,5 +6,6 @@ __pycache__/
.pytest_cache/
.vscode
.dockerignore

/data/companies.csv
/data/output.csv
/data/companies/
16 changes: 8 additions & 8 deletions Makefile
@@ -7,26 +7,26 @@ view: ## view the Selenium browser's activity
vinagre localhost:5900

companies: build ## run the 'companies' Scrapy spider
docker-compose up --build scrapy_companies
docker compose up --build scrapy_companies

random: build ## run the 'random' Scrapy spider
docker-compose up scrapy_random
docker compose up scrapy_random

byname: build ## run the 'byname' Scrapy spider
docker-compose up scrapy_byname
docker compose up scrapy_byname

test: ## run Pytest on the 'tests/*' directory
docker-compose up scrapy_test
docker compose up scrapy_test

attach: ## follow the logs of the 'scrapy' service
docker-compose logs -f
docker compose logs -f

stop: ## stop all services defined in Docker Compose
docker-compose stop
docker compose stop

build: ## build all services defined in Docker Compose
docker-compose build
docker compose build

selenium: ## launches selenium
docker-compose up -d selenium
docker compose up -d selenium

3 changes: 1 addition & 2 deletions README.md
@@ -33,8 +33,7 @@ The software can be configured to send custom connection messages to LinkedIn us

You will need the following:

- Docker
- Docker Compose
- Docker; I am using version 27.3.1. Be sure you have the docker compose plugin.
- A VNC viewer (e.g., Vinagre for Ubuntu)

### Steps
29 changes: 29 additions & 0 deletions compose/Dockerfile
@@ -0,0 +1,29 @@
# Use the base Python image
FROM python:3.11

# Define the application directory
ARG APP_HOME=/app
WORKDIR ${APP_HOME}

# Define a build argument to determine which environment to use
ARG BUILD_ENV=production

# Copy requirements folder and install the dependencies based on BUILD_ENV
COPY ./requirements /requirements
RUN pip install -r /requirements/${BUILD_ENV}.txt

# Copy necessary startup scripts
COPY ./compose/start /start
RUN sed -i 's/\r$//g' /start && chmod +x /start

COPY ./compose/start_companies /start_companies
RUN sed -i 's/\r$//g' /start_companies && chmod +x /start_companies

COPY ./compose/wait-for-selenium /wait-for-selenium
RUN sed -i 's/\r$//g' /wait-for-selenium && chmod +x /wait-for-selenium

# Copy the entire application code to the app directory
COPY . ${APP_HOME}

# Set the working directory
WORKDIR ${APP_HOME}
11 changes: 0 additions & 11 deletions compose/local/Dockerfile

This file was deleted.

25 changes: 0 additions & 25 deletions compose/production/Dockerfile

This file was deleted.

File renamed without changes.
@@ -6,5 +6,4 @@ set -o nounset

/wait-for-selenium http://selenium:4444/wd/hub

# Use $1 to get the first argument passed to the script
python sequential_run.py
@@ -5,12 +5,11 @@ set -e

url="$1"
shift
cmd="$@"

until wget -O- "$url"; do
>&2 echo "Selenium is unavailable - sleeping"
sleep 1
done

>&2 echo "Selenium is up - executing command"
exec $cmd
>&2 echo "Selenium is up"

8 changes: 4 additions & 4 deletions conf_template.py
@@ -14,7 +14,7 @@
# Keep it None to disable personalized Icebreakers generation
OPENAI_API_KEY = None

CONNECTION_REQUEST_LLM_PROMPT = """Act as a LinkedIn content creator reaching out to a professional on LinkedIn.
CONNECTION_REQUEST_LLM_PROMPT_TEMPLATE = """Act as a LinkedIn content creator reaching out to a professional on LinkedIn.
Craft a connection request message referencing their past work experiences, showcasing that you've reviewed their
profile, include specific details. Identify from their profile their primary language and write the message in that
language. Do not include any line with subject or Primary language.
@@ -33,15 +33,15 @@
# networking on LinkedIn, but use it with caution. Excessive connection requests
# can lead to your LinkedIn account being flagged or banned.
# If set to False, the spider will only scrape data without sending any connection requests.
SEND_CONNECTION_REQUESTS = True
SEND_CONNECTION_REQUESTS = False

# Feature Flag: SELECTIVE_SCRAPING
# If set to True, the scraper will skip some profiles based on role-based filters
SELECTIVE_SCRAPING = True
SELECTIVE_SCRAPING = False

# List of roles to select either in connection requests when
# SEND_CONNECTION_REQUESTS is enabled or simply to scrape and enrich
ROLES_FOR_CONNECTION_REQUESTS = [
ROLES_KEYWORDS = [
"CEO",
"CTO",
"CFO",
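For context, here is a minimal sketch of how the renamed ROLES_KEYWORDS list could drive the SELECTIVE_SCRAPING filter. The function name and the profile shape are illustrative assumptions; the real filtering lives in the spider's skip_profile helper.

ROLES_KEYWORDS = ["CEO", "CTO", "CFO"]

def matches_target_role(profile: dict) -> bool:
    # Illustrative only: assumes a profile dict exposing a free-text "headline".
    headline = (profile.get("headline") or "").upper()
    return any(keyword.upper() in headline for keyword in ROLES_KEYWORDS)

print(matches_target_role({"headline": "CTO at Example Corp"}))  # True
print(matches_target_role({"headline": "Data Analyst"}))         # False
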
1 change: 1 addition & 0 deletions data/companies.txt
@@ -0,0 +1 @@
https://www.linkedin.com/company/google/
10 changes: 6 additions & 4 deletions docker-compose.yml
@@ -1,9 +1,10 @@
version: '3'

x-scrapy-common: &scrapy-common
build:
context: .
dockerfile: compose/production/Dockerfile
dockerfile: compose/Dockerfile # Use the merged Dockerfile
args:
BUILD_ENV: "production" # Use production environment
environment:
- PYTHONUNBUFFERED=0
depends_on:
@@ -36,12 +37,13 @@ services:
scrapy_test:
build:
context: .
dockerfile: compose/local/Dockerfile
dockerfile: compose/Dockerfile # Use the same merged Dockerfile
args:
BUILD_ENV: "local" # Specify the local environment
environment:
- PYTHONUNBUFFERED=0
depends_on:
- selenium
volumes:
- .:/app
- ./data:/app/data
command: [ "py.test", "tests/companies.py", "tests/selenium.py"]
4 changes: 2 additions & 2 deletions linkedin/integrations/selenium.py
@@ -76,10 +76,10 @@ def get_by_xpath_or_none(driver, xpath, wait_timeout=None, log=False):


def is_security_check(driver):
return get_by_xpath_or_none(driver, f'//h1[contains(text(), "security check")]')
return get_by_xpath_or_none(driver, f'//h1[contains(text(), "security check")]', 3)


def build_driver(login=False):
def build_driver(login=True):
SELENIUM_HOSTNAME = "selenium"
selenium_url = f"http://{SELENIUM_HOSTNAME}:4444/wd/hub"
chrome_options = webdriver.ChromeOptions()
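A minimal usage sketch of the two changed helpers, assuming the Selenium service is reachable; the XPath mirrors the security-check lookup above and the 3-second wait matches the new explicit timeout.

from linkedin.integrations.selenium import build_driver, get_by_xpath_or_none

driver = build_driver()  # login=True is now the default
# A short explicit wait, as in the updated is_security_check()
element = get_by_xpath_or_none(
    driver, '//h1[contains(text(), "security check")]', wait_timeout=3
)
if element is not None:
    print("Security check page detected")
driver.quit()
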
59 changes: 38 additions & 21 deletions linkedin/spiders/by_name.py
@@ -1,4 +1,5 @@
import logging
import os
from urllib.parse import urlencode

from scrapy import Request
@@ -7,40 +8,56 @@

logger = logging.getLogger(__name__)

NAMES_FILE = "data/names.txt"
NAMES_FILE = "/app/data/names.txt"
BASE_SEARCH_URL = "https://www.linkedin.com/search/results/people/"


class ByNameSpider(SearchSpider):
"""
Spider that searches people by name.
"""

name = "byname"

def __init__(self, *args, **kwargs):
# Initialize SearchSpider with a default start_url
start_url = BASE_SEARCH_URL
super().__init__(start_url=start_url, *args, **kwargs)

def start_requests(self):
# Check if the file exists before trying to read it
if not os.path.isfile(NAMES_FILE):
logger.error(f"Names file {NAMES_FILE} not found. Please ensure the file exists.")
return # Stop execution if the file is missing

# Read the names from the file and handle empty files
with open(NAMES_FILE, "rt") as f:
names = [line.rstrip() for line in f]
if len(names) > 1:
logger.warning(
f"At the moment accepting only one name in {NAMES_FILE}, ignoring the rest"
)

searched_name = names[0]
logging.debug(f"encoded_name: {searched_name.lower()}")
params = {
"origin": "GLOBAL_SEARCH_HEADER",
"keywords": searched_name.lower(),
"page": 1,
}
search_url = BASE_SEARCH_URL + "?" + urlencode(params)

yield Request(
url=search_url,
callback=super().parse_search_list,
meta={"searched_name": searched_name},
names = [line.rstrip() for line in f if line.strip()] # Ignore empty lines

if not names:
logger.error(f"Names file {NAMES_FILE} is empty. Please provide at least one name.")
return # Stop execution if the file is empty

# Limit to the first name if there are multiple
if len(names) > 1:
logger.warning(
f"At the moment accepting only one name in {NAMES_FILE}, ignoring the rest"
)

searched_name = names[0]
logger.debug(f"encoded_name: {searched_name.lower()}")
params = {
"origin": "GLOBAL_SEARCH_HEADER",
"keywords": searched_name.lower(),
"page": 1,
}
search_url = BASE_SEARCH_URL + "?" + urlencode(params)

yield Request(
url=search_url,
callback=super().parse_search_list,
meta={"searched_name": searched_name},
)

def should_stop(self, response):
name_set = set(response.meta["searched_name"].lower().strip().split())

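As an illustration of what start_requests now builds, the snippet below reconstructs the search URL by hand for a hypothetical name; only the urlencode behaviour shown in the spider is assumed.

from urllib.parse import urlencode

BASE_SEARCH_URL = "https://www.linkedin.com/search/results/people/"
searched_name = "Jane Doe"  # hypothetical first non-empty line of /app/data/names.txt
params = {"origin": "GLOBAL_SEARCH_HEADER", "keywords": searched_name.lower(), "page": 1}
print(BASE_SEARCH_URL + "?" + urlencode(params))
# https://www.linkedin.com/search/results/people/?origin=GLOBAL_SEARCH_HEADER&keywords=jane+doe&page=1
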
6 changes: 5 additions & 1 deletion linkedin/spiders/random.py
@@ -2,7 +2,7 @@
from scrapy.spiders import CrawlSpider, Rule

from linkedin.integrations.linkedin_api import extract_profile_id
from linkedin.integrations.selenium import get_by_xpath_or_none
from linkedin.integrations.selenium import get_by_xpath_or_none, build_driver
from linkedin.middlewares.selenium import SeleniumSpiderMixin

"""
@@ -12,6 +12,10 @@


class RandomSpider(CrawlSpider, SeleniumSpiderMixin):
def __init__(self, driver=None, *args, **kwargs):
super().__init__(*args, **kwargs)
self.driver = driver or build_driver()

name = "random"
allowed_domains = ("linkedin.com",)
start_urls = [
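The new constructor makes the driver injectable, which is mainly useful in tests; below is a sketch with a hypothetical stub driver, since without an argument the spider still calls build_driver().

from linkedin.spiders.random import RandomSpider

class StubDriver:
    """Hypothetical stand-in for a Selenium driver, used only for illustration."""
    def get(self, url):
        print(f"would navigate to {url}")
    def quit(self):
        pass

spider = RandomSpider(driver=StubDriver())  # no remote Selenium needed
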
14 changes: 7 additions & 7 deletions linkedin/spiders/search.py
@@ -1,13 +1,13 @@
import logging
from time import sleep

from langchain.llms import OpenAI
from langchain_community.llms.openai import OpenAI
from scrapy import Request, Spider
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

from conf import (
CONNECTION_REQUEST_LLM_PROMPT,
CONNECTION_REQUEST_LLM_PROMPT_TEMPLATE,
DEFAULT_CONNECTION_MESSAGE,
MAX_PROFILES_TO_CONNECT,
MAX_PROFILES_TO_SCRAPE,
@@ -107,7 +107,7 @@ def skip_profile(user_profile):
def generate_connection_message(llm: OpenAI, user_profile):
from langchain import PromptTemplate

prompt_template = PromptTemplate.from_template(CONNECTION_REQUEST_LLM_PROMPT)
prompt_template = PromptTemplate.from_template(CONNECTION_REQUEST_LLM_PROMPT_TEMPLATE)

prompt = prompt_template.format(profile=user_profile)
logger.debug(f"Generate message with prompt:\n{prompt}:")
@@ -254,10 +254,10 @@ def check_if_no_results_found(self, driver):
"//div[contains(@class, 'search-reusable-search-no-results')]"
)
return (
get_by_xpath_or_none(
driver=driver, xpath=no_result_found_xpath, wait_timeout=3
)
is not None
get_by_xpath_or_none(
driver=driver, xpath=no_result_found_xpath, wait_timeout=3
)
is not None
)

def get_next_url(self, response):
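For reference, a minimal sketch of the renamed prompt constant with the new langchain_community import; the invoke call and the inline profile text are assumptions, not a copy of generate_connection_message, and a real OPENAI_API_KEY is required.

from langchain.prompts import PromptTemplate
from langchain_community.llms.openai import OpenAI

from conf import CONNECTION_REQUEST_LLM_PROMPT_TEMPLATE, OPENAI_API_KEY

llm = OpenAI(openai_api_key=OPENAI_API_KEY)  # OPENAI_API_KEY must be set in conf.py
prompt = PromptTemplate.from_template(CONNECTION_REQUEST_LLM_PROMPT_TEMPLATE).format(
    profile="CTO at Example Corp, previously lead engineer at a fintech startup"
)
print(llm.invoke(prompt))  # prints the generated connection message
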
11 changes: 7 additions & 4 deletions requirements/base.txt
@@ -1,11 +1,14 @@
# Scrapy
Scrapy==2.11.0 # https://github.com/scrapy/scrapy
Scrapy==2.11.2 # https://github.com/scrapy/scrapy

# Selenium
selenium==4.10.0 # https://github.com/SeleniumHQ/selenium

# Linkedin API library
linkedin-api<3.0.0
linkedin-api==2.3.0

langchain # https://pypi.org/project/langchain/
openai # https://pypi.org/project/openai/
# langchain
langchain==0.3.2 # https://pypi.org/project/langchain/
langchain-community==0.3.1

openai==1.51.0 # https://pypi.org/project/openai/
