Skip to content

Commit

Permalink
Fixing scrape/extract shenanigans wrt -I,--input-dir
Browse files — browse the repository at this point in the history
  • Loading branch information
Yomguithereal committed Sep 6, 2024
1 parent b464bfd commit a190c5e
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 5 deletions.
5 changes: 5 additions & 0 deletions minet/cli/extract/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,11 @@ def action(cli_args, enricher: IndexedEnricher, loading_bar):
if not cli_args.glob:
loading_bar.set_total(enricher.total)

if not cli_args.has_dummy_csv and cli_args.input_dir is None:
from minet.cli.constants import DEFAULT_CONTENT_FOLDER

cli_args.input_dir = DEFAULT_CONTENT_FOLDER

items = create_fetch_like_report_iterator(cli_args, enricher)

worked_on: Dict[int, FetchReportLikeItem] = {}
Expand Down
5 changes: 5 additions & 0 deletions minet/cli/scrape/scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,11 @@ def writerow(row, item):
with LoadingBar(
"Scraping", unit="pages", total=reader.total if not cli_args.glob else None
) as loading_bar:
if not cli_args.has_dummy_csv and cli_args.input_dir is None:
from minet.cli.constants import DEFAULT_CONTENT_FOLDER

cli_args.input_dir = DEFAULT_CONTENT_FOLDER

items = create_fetch_like_report_iterator(cli_args, reader)

worked_on: Dict[int, FetchReportLikeItem] = {}
Expand Down
6 changes: 1 addition & 5 deletions minet/cli/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@

from minet.crawl import CrawlerState
from minet.encodings import is_supported_encoding
from minet.cli.constants import DEFAULT_CONTENT_FOLDER
from minet.cli.console import console
from minet.cli.loading_bar import LoadingBar, StatsItem
from minet.cli.exceptions import FatalError
Expand Down Expand Up @@ -190,10 +189,7 @@ def create_fetch_like_report_iterator(
cli_args: SimpleNamespace, reader: casanova.Reader
) -> Iterator[FetchReportLikeItem]:
headers = reader.headers
input_dir = cli_args.input_dir

if input_dir is None:
input_dir = DEFAULT_CONTENT_FOLDER
input_dir = cli_args.input_dir or ""

# TODO: deal with no_headers
assert headers is not None
Expand Down

0 comments on commit a190c5e

Please sign in to comment.