
Commit

merged dockerfiles, updated requirements and pinned them, updated to Python 3.11, fixed the random and byname scrapers, deprecated docker-compose in favor of docker compose
eracle committed Oct 4, 2024
1 parent 1365ddc commit ae8b6fd
Showing 18 changed files with 144 additions and 104 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -6,5 +6,6 @@ __pycache__/
.pytest_cache/
.vscode
.dockerignore

/data/companies.csv
/data/output.csv
/data/companies/
16 changes: 8 additions & 8 deletions Makefile
@@ -7,26 +7,26 @@ view: ## view the Selenium browser's activity
vinagre localhost:5900

companies: build ## run the 'companies' Scrapy spider
docker-compose up --build scrapy_companies
docker compose up --build scrapy_companies

random: build ## run the 'random' Scrapy spider
docker-compose up scrapy_random
docker compose up scrapy_random

byname: build ## run the 'byname' Scrapy spider
docker-compose up scrapy_byname
docker compose up scrapy_byname

test: ## run Pytest on the 'tests/*' directory
docker-compose up scrapy_test
docker compose up scrapy_test

attach: ## follow the logs of the 'scrapy' service
docker-compose logs -f
docker compose logs -f

stop: ## stop all services defined in Docker Compose
docker-compose stop
docker compose stop

build: ## build all services defined in Docker Compose
docker-compose build
docker compose build

selenium: ## launches selenium
docker-compose up -d selenium
docker compose up -d selenium

3 changes: 1 addition & 2 deletions README.md
@@ -33,8 +33,7 @@ The software can be configured to send custom connection messages to LinkedIn us

You will need the following:

- Docker
- Docker Compose
- Docker; I am using version 27.3.1. Be sure you have the docker compose plugin.
- A VNC viewer (e.g., Vinagre for Ubuntu)

### Steps
29 changes: 29 additions & 0 deletions compose/Dockerfile
@@ -0,0 +1,29 @@
# Use the base Python image
FROM python:3.11

# Define the application directory
ARG APP_HOME=/app
WORKDIR ${APP_HOME}

# Define a build argument to determine which environment to use
ARG BUILD_ENV=production

# Copy requirements folder and install the dependencies based on BUILD_ENV
COPY ./requirements /requirements
RUN pip install -r /requirements/${BUILD_ENV}.txt

# Copy necessary startup scripts
COPY ./compose/start /start
RUN sed -i 's/\r$//g' /start && chmod +x /start

COPY ./compose/start_companies /start_companies
RUN sed -i 's/\r$//g' /start_companies && chmod +x /start_companies

COPY ./compose/wait-for-selenium /wait-for-selenium
RUN sed -i 's/\r$//g' /wait-for-selenium && chmod +x /wait-for-selenium

# Copy the entire application code to the app directory
COPY . ${APP_HOME}

# Set the working directory
WORKDIR ${APP_HOME}
11 changes: 0 additions & 11 deletions compose/local/Dockerfile

This file was deleted.

25 changes: 0 additions & 25 deletions compose/production/Dockerfile

This file was deleted.

File renamed without changes.
@@ -6,5 +6,4 @@ set -o nounset

/wait-for-selenium http://selenium:4444/wd/hub

# Use $1 to get the first argument passed to the script
python sequential_run.py
@@ -5,12 +5,11 @@ set -e

url="$1"
shift
cmd="$@"

until wget -O- "$url"; do
>&2 echo "Selenium is unavailable - sleeping"
sleep 1
done

>&2 echo "Selenium is up - executing command"
exec $cmd
>&2 echo "Selenium is up"

8 changes: 4 additions & 4 deletions conf_template.py
@@ -14,7 +14,7 @@
# Keep it None to disable personalized Icebreakers generation
OPENAI_API_KEY = None

CONNECTION_REQUEST_LLM_PROMPT = """Act as a LinkedIn content creator reaching out to a professional on LinkedIn.
CONNECTION_REQUEST_LLM_PROMPT_TEMPLATE = """Act as a LinkedIn content creator reaching out to a professional on LinkedIn.
Craft a connection request message referencing their past work experiences, showcasing that you've reviewed their
profile, include specific details. Identify from their profile their primary language and write the message in that
language. Do not include any line with subject or Primary language.
@@ -33,15 +33,15 @@
# networking on LinkedIn, but use it with caution. Excessive connection requests
# can lead to your LinkedIn account being flagged or banned.
# If set to False, the spider will only scrape data without sending any connection requests.
SEND_CONNECTION_REQUESTS = True
SEND_CONNECTION_REQUESTS = False

# Feature Flag: SELECTIVE_SCRAPING
# If set to True, the scraper will skip some profiles based on role-based filters
SELECTIVE_SCRAPING = True
SELECTIVE_SCRAPING = False

# List of roles to select either in connection requests when
# SEND_CONNECTION_REQUESTS is enabled or simply to scrape and enrich
ROLES_FOR_CONNECTION_REQUESTS = [
ROLES_KEYWORDS = [
"CEO",
"CTO",
"CFO",
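For context, here is a minimal sketch of how the renamed ROLES_KEYWORDS list could drive the SELECTIVE_SCRAPING filter. The function name and the profile shape are illustrative assumptions; the real filtering lives in the spider's skip_profile helper.

ROLES_KEYWORDS = ["CEO", "CTO", "CFO"]

def matches_target_role(profile: dict) -> bool:
    # Illustrative only: assumes a profile dict exposing a free-text "headline".
    headline = (profile.get("headline") or "").upper()
    return any(keyword.upper() in headline for keyword in ROLES_KEYWORDS)

print(matches_target_role({"headline": "CTO at Example Corp"}))  # True
print(matches_target_role({"headline": "Data Analyst"}))         # False
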
1 change: 1 addition & 0 deletions data/companies.txt
@@ -0,0 +1 @@
https://www.linkedin.com/company/google/
10 changes: 6 additions & 4 deletions docker-compose.yml
@@ -1,9 +1,10 @@
version: '3'

x-scrapy-common: &scrapy-common
build:
context: .
dockerfile: compose/production/Dockerfile
dockerfile: compose/Dockerfile # Use the merged Dockerfile
args:
BUILD_ENV: "production" # Use production environment
environment:
- PYTHONUNBUFFERED=0
depends_on:
@@ -36,12 +37,13 @@ services:
scrapy_test:
build:
context: .
dockerfile: compose/local/Dockerfile
dockerfile: compose/Dockerfile # Use the same merged Dockerfile
args:
BUILD_ENV: "local" # Specify the local environment
environment:
- PYTHONUNBUFFERED=0
depends_on:
- selenium
volumes:
- .:/app
- ./data:/app/data
command: [ "py.test", "tests/companies.py", "tests/selenium.py"]
4 changes: 2 additions & 2 deletions linkedin/integrations/selenium.py
@@ -76,10 +76,10 @@ def get_by_xpath_or_none(driver, xpath, wait_timeout=None, log=False):


def is_security_check(driver):
return get_by_xpath_or_none(driver, f'//h1[contains(text(), "security check")]')
return get_by_xpath_or_none(driver, f'//h1[contains(text(), "security check")]', 3)


def build_driver(login=False):
def build_driver(login=True):
SELENIUM_HOSTNAME = "selenium"
selenium_url = f"http://{SELENIUM_HOSTNAME}:4444/wd/hub"
chrome_options = webdriver.ChromeOptions()
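A minimal usage sketch of the two changed helpers, assuming the Selenium service is reachable; the XPath mirrors the security-check lookup above and the 3-second wait matches the new explicit timeout.

from linkedin.integrations.selenium import build_driver, get_by_xpath_or_none

driver = build_driver()  # login=True is now the default
# A short explicit wait, as in the updated is_security_check()
element = get_by_xpath_or_none(
    driver, '//h1[contains(text(), "security check")]', wait_timeout=3
)
if element is not None:
    print("Security check page detected")
driver.quit()
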
59 changes: 38 additions & 21 deletions linkedin/spiders/by_name.py
@@ -1,4 +1,5 @@
import logging
import os
from urllib.parse import urlencode

from scrapy import Request
@@ -7,40 +8,56 @@

logger = logging.getLogger(__name__)

NAMES_FILE = "data/names.txt"
NAMES_FILE = "/app/data/names.txt"
BASE_SEARCH_URL = "https://www.linkedin.com/search/results/people/"


class ByNameSpider(SearchSpider):
"""
Spider that searches people by name.
"""

name = "byname"

def __init__(self, *args, **kwargs):
# Initialize SearchSpider with a default start_url
start_url = BASE_SEARCH_URL
super().__init__(start_url=start_url, *args, **kwargs)

def start_requests(self):
# Check if the file exists before trying to read it
if not os.path.isfile(NAMES_FILE):
logger.error(f"Names file {NAMES_FILE} not found. Please ensure the file exists.")
return # Stop execution if the file is missing

# Read the names from the file and handle empty files
with open(NAMES_FILE, "rt") as f:
names = [line.rstrip() for line in f]
if len(names) > 1:
logger.warning(
f"At the moment accepting only one name in {NAMES_FILE}, ignoring the rest"
)

searched_name = names[0]
logging.debug(f"encoded_name: {searched_name.lower()}")
params = {
"origin": "GLOBAL_SEARCH_HEADER",
"keywords": searched_name.lower(),
"page": 1,
}
search_url = BASE_SEARCH_URL + "?" + urlencode(params)

yield Request(
url=search_url,
callback=super().parse_search_list,
meta={"searched_name": searched_name},
names = [line.rstrip() for line in f if line.strip()] # Ignore empty lines

if not names:
logger.error(f"Names file {NAMES_FILE} is empty. Please provide at least one name.")
return # Stop execution if the file is empty

# Limit to the first name if there are multiple
if len(names) > 1:
logger.warning(
f"At the moment accepting only one name in {NAMES_FILE}, ignoring the rest"
)

searched_name = names[0]
logger.debug(f"encoded_name: {searched_name.lower()}")
params = {
"origin": "GLOBAL_SEARCH_HEADER",
"keywords": searched_name.lower(),
"page": 1,
}
search_url = BASE_SEARCH_URL + "?" + urlencode(params)

yield Request(
url=search_url,
callback=super().parse_search_list,
meta={"searched_name": searched_name},
)

def should_stop(self, response):
name_set = set(response.meta["searched_name"].lower().strip().split())

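As an illustration of what start_requests now builds, the snippet below reconstructs the search URL by hand for a hypothetical name; only the urlencode behaviour shown in the spider is assumed.

from urllib.parse import urlencode

BASE_SEARCH_URL = "https://www.linkedin.com/search/results/people/"
searched_name = "Jane Doe"  # hypothetical first non-empty line of /app/data/names.txt
params = {"origin": "GLOBAL_SEARCH_HEADER", "keywords": searched_name.lower(), "page": 1}
print(BASE_SEARCH_URL + "?" + urlencode(params))
# https://www.linkedin.com/search/results/people/?origin=GLOBAL_SEARCH_HEADER&keywords=jane+doe&page=1
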
6 changes: 5 additions & 1 deletion linkedin/spiders/random.py
@@ -2,7 +2,7 @@
from scrapy.spiders import CrawlSpider, Rule

from linkedin.integrations.linkedin_api import extract_profile_id
from linkedin.integrations.selenium import get_by_xpath_or_none
from linkedin.integrations.selenium import get_by_xpath_or_none, build_driver
from linkedin.middlewares.selenium import SeleniumSpiderMixin

"""
@@ -12,6 +12,10 @@


class RandomSpider(CrawlSpider, SeleniumSpiderMixin):
def __init__(self, driver=None, *args, **kwargs):
super().__init__(*args, **kwargs)
self.driver = driver or build_driver()

name = "random"
allowed_domains = ("linkedin.com",)
start_urls = [
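The new constructor makes the driver injectable, which is mainly useful in tests; below is a sketch with a hypothetical stub driver, since without an argument the spider still calls build_driver().

from linkedin.spiders.random import RandomSpider

class StubDriver:
    """Hypothetical stand-in for a Selenium driver, used only for illustration."""
    def get(self, url):
        print(f"would navigate to {url}")
    def quit(self):
        pass

spider = RandomSpider(driver=StubDriver())  # no remote Selenium needed
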
14 changes: 7 additions & 7 deletions linkedin/spiders/search.py
@@ -1,13 +1,13 @@
import logging
from time import sleep

from langchain.llms import OpenAI
from langchain_community.llms.openai import OpenAI
from scrapy import Request, Spider
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

from conf import (
CONNECTION_REQUEST_LLM_PROMPT,
CONNECTION_REQUEST_LLM_PROMPT_TEMPLATE,
DEFAULT_CONNECTION_MESSAGE,
MAX_PROFILES_TO_CONNECT,
MAX_PROFILES_TO_SCRAPE,
@@ -107,7 +107,7 @@ def skip_profile(user_profile):
def generate_connection_message(llm: OpenAI, user_profile):
from langchain import PromptTemplate

prompt_template = PromptTemplate.from_template(CONNECTION_REQUEST_LLM_PROMPT)
prompt_template = PromptTemplate.from_template(CONNECTION_REQUEST_LLM_PROMPT_TEMPLATE)

prompt = prompt_template.format(profile=user_profile)
logger.debug(f"Generate message with prompt:\n{prompt}:")
@@ -254,10 +254,10 @@ def check_if_no_results_found(self, driver):
"//div[contains(@class, 'search-reusable-search-no-results')]"
)
return (
get_by_xpath_or_none(
driver=driver, xpath=no_result_found_xpath, wait_timeout=3
)
is not None
get_by_xpath_or_none(
driver=driver, xpath=no_result_found_xpath, wait_timeout=3
)
is not None
)

def get_next_url(self, response):
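For reference, a minimal sketch of the renamed prompt constant with the new langchain_community import; the invoke call and the inline profile text are assumptions, not a copy of generate_connection_message, and a real OPENAI_API_KEY is required.

from langchain.prompts import PromptTemplate
from langchain_community.llms.openai import OpenAI

from conf import CONNECTION_REQUEST_LLM_PROMPT_TEMPLATE, OPENAI_API_KEY

llm = OpenAI(openai_api_key=OPENAI_API_KEY)  # OPENAI_API_KEY must be set in conf.py
prompt = PromptTemplate.from_template(CONNECTION_REQUEST_LLM_PROMPT_TEMPLATE).format(
    profile="CTO at Example Corp, previously lead engineer at a fintech startup"
)
print(llm.invoke(prompt))  # prints the generated connection message
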
11 changes: 7 additions & 4 deletions requirements/base.txt
@@ -1,11 +1,14 @@
# Scrapy
Scrapy==2.11.0 # https://github.com/scrapy/scrapy
Scrapy==2.11.2 # https://github.com/scrapy/scrapy

# Selenium
selenium==4.10.0 # https://github.com/SeleniumHQ/selenium

# Linkedin API library
linkedin-api<3.0.0
linkedin-api==2.3.0

langchain # https://pypi.org/project/langchain/
openai # https://pypi.org/project/openai/
# langchain
langchain==0.3.2 # https://pypi.org/project/langchain/
langchain-community==0.3.1

openai==1.51.0 # https://pypi.org/project/openai/
