Commit 64347a8: upd

monosans committed Jan 21, 2024
1 parent c980ed5 commit 64347a8
Showing 9 changed files with 109 additions and 97 deletions.
11 changes: 11 additions & 0 deletions .github/workflows/auto-merge.yml
@@ -6,6 +6,17 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref }}
   cancel-in-progress: true
 jobs:
+  auto-merge-dependabot:
+    runs-on: ubuntu-latest
+    if: ${{ github.actor == 'dependabot[bot]' }}
+    steps:
+      - id: dependabot-metadata
+        uses: dependabot/fetch-metadata@v1
+      - if: ${{ steps.dependabot-metadata.outputs.update-type != 'version-update:semver-major' }}
+        run: gh pr merge --auto --delete-branch --squash "${PR_URL}"
+        env:
+          PR_URL: ${{ github.event.pull_request.html_url }}
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
   auto-merge-updates:
     runs-on: ubuntu-latest
     if: ${{ github.actor == 'monosans' && startsWith(github.head_ref, 'update/') }}
3 changes: 3 additions & 0 deletions .github/workflows/update-dependencies.yml
@@ -12,6 +12,9 @@ jobs:
     strategy:
       matrix:
         include:
+          - cmd: pipx run poetry lock --no-interaction
+            commit-msg: Update poetry.lock
+            branch: update/poetry-lock
           - cmd: pipx run pre-commit autoupdate
             commit-msg: Update .pre-commit-config.yaml
             branch: update/pre-commit-config
2 changes: 1 addition & 1 deletion .gitignore
@@ -376,4 +376,4 @@ $RECYCLE.BIN/
 
 # End of https://www.toptal.com/developers/gitignore/api/jetbrains+all,linux,macos,python,vim,visualstudiocode,windows
 
-results/
+out/
18 changes: 9 additions & 9 deletions proxy_scraper_checker/__main__.py
@@ -16,7 +16,7 @@
 from rich.progress import BarColumn, MofNCompleteColumn, Progress, TextColumn
 from rich.table import Table
 
-from . import cache, checker, geodb, http, output, scraper, utils
+from . import cache, checker, geodb, http, output, scraper, sort, utils
 from .settings import Settings
 from .storage import ProxyStorage
 from .typing_compat import Any
@@ -75,14 +75,14 @@ def configure_logging(*, console: Console, debug: bool) -> None:
     )
 
 
-def get_results_table(
+def get_summary_table(
     *, before: Mapping[ProxyType, int], after: Mapping[ProxyType, int]
 ) -> Table:
     table = Table()
     table.add_column("Protocol", style="cyan")
     table.add_column("Working", style="magenta")
     table.add_column("Total", style="green")
-    for proto in ProxyType:
+    for proto in sort.PROTOCOL_ORDER:
         total = before.get(proto)
         if total:
             working = after.get(proto, 0)
@@ -112,16 +112,17 @@ async def main() -> None:
         BarColumn(),
         MofNCompleteColumn(),
         console=console,
+        transient=True,
     ) as progress:
-        scrape = scraper.scrape_all_sources(
+        scrape = scraper.scrape_all(
             progress=progress,
             session=session,
             settings=settings,
             storage=storage,
         )
         await (
             asyncio.gather(
-                cache.create_cache(),
+                cache.create(),
                 geodb.download_geodb(progress=progress, session=session),
                 scrape,
             )
@@ -130,7 +131,7 @@
         )
     await session.close()
     count_before_checking = storage.get_count()
-    await checker.check_all_proxies(
+    await checker.check_all(
         settings=settings,
         storage=storage,
         progress=progress,
@@ -139,16 +140,15 @@
 
     count_after_checking = storage.get_count()
     console.print(
-        get_results_table(
+        get_summary_table(
            before=count_before_checking, after=count_after_checking
         )
     )
 
     await output.save_proxies(storage=storage, settings=settings)
 
     logger.info(
-        "Thank you for using "
-        "https://github.com/monosans/proxy-scraper-checker"
+        "Thank you for using https://github.com/monosans/proxy-scraper-checker"
     )
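
Note: the renamed get_summary_table follows a standard rich table pattern. Below is a minimal, self-contained sketch of that pattern; the ProxyType enum and the count mappings are stand-ins invented for this example, and the real code iterates sort.PROTOCOL_ORDER, which this diff does not show.

```python
# Sketch of the rich summary-table pattern used in __main__.py.
# ProxyType and the count dicts are stand-ins for this example only.
import enum

from rich.console import Console
from rich.table import Table


class ProxyType(enum.Enum):
    HTTP = "http"
    SOCKS4 = "socks4"
    SOCKS5 = "socks5"


def get_summary_table(*, before, after) -> Table:
    table = Table()
    table.add_column("Protocol", style="cyan")
    table.add_column("Working", style="magenta")
    table.add_column("Total", style="green")
    for proto in ProxyType:  # the real code iterates sort.PROTOCOL_ORDER
        total = before.get(proto)
        if total:
            working = after.get(proto, 0)
            table.add_row(proto.name, str(working), str(total))
    return table


before = {ProxyType.HTTP: 100, ProxyType.SOCKS5: 50}
after = {ProxyType.HTTP: 12, ProxyType.SOCKS5: 7}
Console().print(get_summary_table(before=before, after=after))
```
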
10 changes: 5 additions & 5 deletions proxy_scraper_checker/cache.py
@@ -5,10 +5,10 @@
 import aiofiles.os
 import platformdirs
 
-CACHE_DIR = platformdirs.user_cache_dir("proxy_scraper_checker")
-CACHE_READY = asyncio.Event()
+DIR = platformdirs.user_cache_dir("proxy_scraper_checker")
+READY_EVENT = asyncio.Event()
 
 
-async def create_cache() -> None:
-    await aiofiles.os.makedirs(CACHE_DIR, exist_ok=True)
-    CACHE_READY.set()
+async def create() -> None:
+    await aiofiles.os.makedirs(DIR, exist_ok=True)
+    READY_EVENT.set()
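
Note: the renamed cache API is an asyncio.Event handshake — create() makes the directory and sets READY_EVENT, and consumers wait on the event before touching cache.DIR (the geodb.py change below shows one such call site). A toy sketch of that handshake, with stand-in names and bodies:

```python
# Toy sketch: coordinating directory creation with asyncio.Event, mirroring
# the renamed API (cache.create / cache.READY_EVENT / cache.DIR).
import asyncio


async def consumer(cache_ready: asyncio.Event) -> None:
    await cache_ready.wait()  # block until the cache directory exists
    print("cache is ready, safe to write files")


async def main() -> None:
    ready = asyncio.Event()

    async def create() -> None:
        # the real code awaits aiofiles.os.makedirs(cache.DIR, exist_ok=True)
        ready.set()

    await asyncio.gather(create(), consumer(ready))


asyncio.run(main())
```
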
48 changes: 24 additions & 24 deletions proxy_scraper_checker/checker.py
@@ -16,7 +16,29 @@
 logger = logging.getLogger(__name__)
 
 
-async def check_all_proxies(
+async def check_one(
+    *,
+    progress: Progress,
+    proxy: Proxy,
+    settings: Settings,
+    storage: ProxyStorage,
+    task: TaskID,
+) -> None:
+    try:
+        await proxy.check(settings=settings)
+    except Exception as e:
+        # Too many open files
+        if isinstance(e, OSError) and e.errno == 24:  # noqa: PLR2004
+            logger.error("Please set max_connections to a lower value")
+
+        logger.debug(
+            "%s.%s: %s", e.__class__.__module__, e.__class__.__qualname__, e
+        )
+        storage.remove(proxy)
+        progress.update(task, advance=1)
+
+
+async def check_all(
     *,
     settings: Settings,
     storage: ProxyStorage,
@@ -32,7 +54,7 @@ async def check_all_proxies(
         if proto in storage.enabled_protocols
     }
     coroutines = [
-        check_proxy(
+        check_one(
             progress=progress,
             proxy=proxy,
             settings=settings,
@@ -43,25 +65,3 @@ async def check_all(
     ]
     shuffle(coroutines)
     await asyncio.gather(*coroutines)
-
-
-async def check_proxy(
-    *,
-    progress: Progress,
-    proxy: Proxy,
-    settings: Settings,
-    storage: ProxyStorage,
-    task: TaskID,
-) -> None:
-    try:
-        await proxy.check(settings=settings)
-    except Exception as e:
-        # Too many open files
-        if isinstance(e, OSError) and e.errno == 24:  # noqa: PLR2004
-            logger.error("Please, set max_connections to lower value")
-
-        logger.debug(
-            "%s.%s: %s", e.__class__.__module__, e.__class__.__qualname__, e
-        )
-        storage.remove(proxy)
-        progress.update(task, advance=1)
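
Note: the literal 24 guarded by `noqa: PLR2004` is EMFILE ("too many open files") on Linux and macOS. A small sketch of the same check written against errno's symbolic constant — an alternative spelling, not what the commit does:

```python
# Sketch: detecting "too many open files" via errno.EMFILE instead of the
# magic number 24 (EMFILE equals 24 on Linux/macOS; the constant is portable).
import errno
import logging

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)


def handle_check_failure(e: Exception) -> None:
    if isinstance(e, OSError) and e.errno == errno.EMFILE:
        logger.error("Please set max_connections to a lower value")
    logger.debug(
        "%s.%s: %s", e.__class__.__module__, e.__class__.__qualname__, e
    )


handle_check_failure(OSError(errno.EMFILE, "Too many open files"))
```
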
8 changes: 3 additions & 5 deletions proxy_scraper_checker/geodb.py
@@ -9,15 +9,13 @@
 from aiohttp import ClientResponse, ClientSession, hdrs
 from rich.progress import Progress, TaskID
 
-from proxy_scraper_checker import cache
-
-from .cache import CACHE_DIR
+from . import cache
 from .utils import bytes_decode
 
 logger = logging.getLogger(__name__)
 
 GEODB_URL = "https://raw.githubusercontent.com/P3TERX/GeoLite.mmdb/download/GeoLite2-City.mmdb"
-GEODB_PATH = Path(CACHE_DIR, "geolocation_database.mmdb")
+GEODB_PATH = Path(cache.DIR, "geolocation_database.mmdb")
 GEODB_ETAG_PATH = GEODB_PATH.with_suffix(".mmdb.etag")
 
 
@@ -62,7 +60,7 @@ async def download_geodb(*, progress: Progress, session: ClientSession) -> None:
             GEODB_PATH,
         )
         return
-    await cache.CACHE_READY.wait()
+    await cache.READY_EVENT.wait()
     await _save_geodb(
         progress=progress,
         response=response,
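
Note: GEODB_ETAG_PATH implies ETag-based revalidation of the downloaded database. A rough sketch of that general technique with aiohttp — send If-None-Match with the stored ETag and skip the write on 304 Not Modified. The paths and URL here are placeholders, and this is not the project's exact code:

```python
# Rough illustration of ETag revalidation with aiohttp (general technique
# only; placeholder paths and URL).
import asyncio
from pathlib import Path

from aiohttp import ClientSession, hdrs

DB_PATH = Path("GeoLite2-City.mmdb")
ETAG_PATH = DB_PATH.with_suffix(".mmdb.etag")
URL = "https://example.com/GeoLite2-City.mmdb"  # placeholder


async def download_if_changed(session: ClientSession) -> None:
    headers = {}
    if DB_PATH.exists() and ETAG_PATH.exists():
        # Ask the server to answer 304 if our copy is still current.
        headers[hdrs.IF_NONE_MATCH] = ETAG_PATH.read_text()
    async with session.get(URL, headers=headers) as response:
        if response.status == 304:
            return  # cached file is up to date
        response.raise_for_status()
        DB_PATH.write_bytes(await response.read())
        etag = response.headers.get(hdrs.ETAG)
        if etag:
            ETAG_PATH.write_text(etag)


async def main() -> None:
    async with ClientSession() as session:
        await download_if_changed(session)


asyncio.run(main())
```
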
64 changes: 32 additions & 32 deletions proxy_scraper_checker/scraper.py
@@ -20,37 +20,7 @@
 logger = logging.getLogger(__name__)
 
 
-async def scrape_all_sources(
-    *,
-    progress: Progress,
-    session: ClientSession,
-    settings: Settings,
-    storage: ProxyStorage,
-) -> None:
-    tasks = {
-        proto: progress.add_task(
-            f"[yellow]Scraper [red]:: [green]{proto.name}", total=len(sources)
-        )
-        for proto, sources in settings.sources.items()
-    }
-    timeout = ClientTimeout(total=settings.source_timeout)
-    coroutines = (
-        scrape_source(
-            progress=progress,
-            proto=proto,
-            session=session,
-            source=source,
-            storage=storage,
-            task=tasks[proto],
-            timeout=timeout,
-        )
-        for proto, sources in settings.sources.items()
-        for source in sources
-    )
-    await asyncio.gather(*coroutines)
-
-
-async def scrape_source(
+async def scrape_one(
     *,
     progress: Progress,
     proto: ProxyType,
@@ -71,7 +41,7 @@ async def scrape_source(
                 content = await f.read()
         text = bytes_decode(content)
     except Exception as e:
-        logger.error(
+        logger.warning(
            "%s | %s.%s: %s",
            source,
            e.__class__.__module__,
@@ -105,3 +75,33 @@
             )
         )
     progress.update(task, advance=1)
+
+
+async def scrape_all(
+    *,
+    progress: Progress,
+    session: ClientSession,
+    settings: Settings,
+    storage: ProxyStorage,
+) -> None:
+    tasks = {
+        proto: progress.add_task(
+            f"[yellow]Scraper [red]:: [green]{proto.name}", total=len(sources)
+        )
+        for proto, sources in settings.sources.items()
+    }
+    timeout = ClientTimeout(total=settings.source_timeout)
+    coroutines = (
+        scrape_one(
+            progress=progress,
+            proto=proto,
+            session=session,
+            source=source,
+            storage=storage,
+            task=tasks[proto],
+            timeout=timeout,
+        )
+        for proto, sources in settings.sources.items()
+        for source in sources
+    )
+    await asyncio.gather(*coroutines)
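
Note: scrape_all fans out with one rich progress task per protocol and one coroutine per source, all awaited by a single asyncio.gather. A stripped-down, runnable version of the same shape; the sources dict and the sleep body are stand-ins for the real fetch/parse work:

```python
# Stripped-down sketch of the scrape_all fan-out: one progress task per
# group, one coroutine per item, a single asyncio.gather at the end.
import asyncio

from rich.progress import Progress, TaskID


async def scrape_one(*, progress: Progress, source: str, task: TaskID) -> None:
    await asyncio.sleep(0.1)  # stands in for the real fetch/parse work
    progress.update(task, advance=1)


async def main() -> None:
    sources = {"HTTP": ["a", "b"], "SOCKS5": ["c"]}
    with Progress() as progress:
        tasks = {
            proto: progress.add_task(f"Scraper :: {proto}", total=len(items))
            for proto, items in sources.items()
        }
        await asyncio.gather(*(
            scrape_one(progress=progress, source=s, task=tasks[proto])
            for proto, items in sources.items()
            for s in items
        ))


asyncio.run(main())
```
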
42 changes: 21 additions & 21 deletions proxy_scraper_checker/settings.py
@@ -35,26 +35,6 @@
 logger = logging.getLogger(__name__)
 
 
-def _get_max_connections(value: int, /) -> Optional[int]:
-    if value < 0:
-        msg = "max_connections must be non-negative"
-        raise ValueError(msg)
-    max_supported = _get_supported_max_connections()
-    if not value:
-        logger.info("Using %d as max_connections value", max_supported or 0)
-        return max_supported
-    if not max_supported or value <= max_supported:
-        return value
-    logger.warning(
-        "max_connections value is too high. "
-        "Your OS supports a maximum of %d. "
-        "The config value will be ignored and %d will be used.",
-        max_supported,
-        max_supported,
-    )
-    return max_supported
-
-
 def _get_supported_max_connections() -> Optional[int]:
     if sys.platform == "win32":
         if isinstance(
@@ -67,7 +47,7 @@ def _get_supported_max_connections() -> Optional[int]:
 
     soft_limit, hard_limit = resource.getrlimit(resource.RLIMIT_NOFILE)
     logger.debug(
-        "max_connections soft limit = %d, hard limit = %d, infinity = %d",
+        "max_connections: soft limit = %d, hard limit = %d, infinity = %d",
         soft_limit,
         hard_limit,
         resource.RLIM_INFINITY,
@@ -84,6 +64,26 @@
     return soft_limit
 
 
+def _get_max_connections(value: int, /) -> Optional[int]:
+    if value < 0:
+        msg = "max_connections must be non-negative"
+        raise ValueError(msg)
+    max_supported = _get_supported_max_connections()
+    if not value:
+        logger.info("Using %d as max_connections value", max_supported or 0)
+        return max_supported
+    if not max_supported or value <= max_supported:
+        return value
+    logger.warning(
+        "max_connections value is too high. "
+        "Your OS supports a maximum of %d. "
+        "The config value will be ignored and %d will be used.",
+        max_supported,
+        max_supported,
+    )
+    return max_supported
+
+
 def _semaphore_converter(
     value: int, /
 ) -> Union[asyncio.Semaphore, NullContext]:
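
Note: _get_supported_max_connections builds on resource.getrlimit. A minimal POSIX-only sketch of that query (the Windows branch guarded by `sys.platform == "win32"` above is omitted here):

```python
# Sketch: querying the open-file-descriptor limit on POSIX systems, as
# _get_supported_max_connections does. Runs on Linux/macOS only.
import resource

soft_limit, hard_limit = resource.getrlimit(resource.RLIMIT_NOFILE)
print(f"soft = {soft_limit}, hard = {hard_limit}")

if soft_limit == resource.RLIM_INFINITY:
    print("no soft limit: max_connections is effectively unbounded")
else:
    # A checker opening one socket per proxy cannot usefully exceed this.
    print(f"at most ~{soft_limit} concurrent connections are supported")
```
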
