Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
carlos-a-g-h authored May 30, 2023
1 parent 2d58b4b commit 9169b0c
Show file tree
Hide file tree
Showing 2 changed files with 83 additions and 45 deletions.
11 changes: 9 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,22 @@ ABUSIV is a recursive downloader for websites that do static file serving with a

- The **BaseDir** argument is the local directory where all the files/folders will be downloaded. The base directory cannot match the program's directory
- The **AType** argument is the type of autoindex that is used by the website
- The **URL** argument is the starting URL, this URL cannot be a direct link from a file, it has to be a directory that leads to other files/directories
- The **URL** argument is the starting URL, this URL cannot be a direct link from a file, it has to be a directory that leads to other files and/or directories

## Available Autoindex types

- h5ai
- apache2
- h5ai

## Changelog

### 2023-05-29

- Made some rewrites (said bye bye to some mutable shared states) and changed some names
- Added some logging: The log file is saved in the base directory and any download failures will be logged as errors
- The output directory for all the downloaded content will be the initial directory (the initial directory name is taken from the initial URL) inside the base directory instead of the base directory itself
- While downloading a file or a page, you can hit Ctrl+C to skip it. In the case of skipping a file, the partially downloaded file will be eliminated and logged as an error

### 2023-05-27

- Bugfixes and added some guardrails
Expand Down
117 changes: 74 additions & 43 deletions abusiv.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,32 @@
import aiofiles
import aiohttp
import datetime
import logging
import yarl

from bs4 import BeautifulSoup
from pathlib import Path
from urllib.parse import unquote as url_to_text

def get_ItemFromTag_apache2(tag,the_odir,reslist,path_origin):
def get_ItemFromTag_apache2(tag,the_odir,path_origin):
tag_td_icon=tag.find("td",attrs={"valign":"top"})
tag_a=tag.find("a")
if (not tag_td_icon) or (not tag_a):
return
return []

tag_img=tag_td_icon.find("img")
if not tag_img:
return
return []

the_type_raw=tag_img.get("alt")
if (not the_type_raw):
return
return []

if the_type_raw=="[DIR]":
fse_type="d"

if not the_type_raw=="[DIR]":
if the_type_raw=="[PARENTDIR]":
return
return []

fse_type="f"

Expand All @@ -37,38 +37,38 @@ def get_ItemFromTag_apache2(tag,the_odir,reslist,path_origin):

the_name=tag_a.text.strip()

reslist.append({"type":fse_type,"url":the_url,"odir":the_odir,"name":the_name})
return [{"type":fse_type,"url":the_url,"odir":the_odir,"name":the_name}]

def get_ItemFromTag_h5ai(tag,the_odir):
    """Extract a single file/directory item from one h5ai listing row.

    Parameters:
        tag: a BeautifulSoup Tag for one <tr> row of the h5ai file table.
        the_odir: output directory associated with this item; stored
            untouched in the resulting item dict.

    Returns:
        A list with exactly one item dict
        {"type": "f"|"d", "url": ..., "odir": ..., "name": ...}
        on success, or an empty list when the row is not a valid
        file/folder entry (so callers can extend() the result directly).
    """
    # h5ai rows carry the icon in a td.fb-i cell and the link in td.fb-n.
    tag_td_icon=tag.find("td",attrs={"class":"fb-i"})
    tag_td_link=tag.find("td",attrs={"class":"fb-n"})
    if (not tag_td_icon) or (not tag_td_link):
        return []

    tag_img=tag_td_icon.find("img")
    if not tag_img:
        return []

    # The icon's alt text tells files and folders apart; anything else
    # (header rows, parent-dir links) is skipped.
    the_type_raw=tag_img.get("alt")
    if (not the_type_raw) or (not the_type_raw in ["file","folder"]):
        return []

    fse_type={"file":"f","folder":"d"}[the_type_raw]

    tag_a=tag_td_link.find("a")
    if not tag_a:
        return []

    # hrefs are percent-encoded; decode to a human-readable URL/path.
    the_url=tag_a.get("href")
    the_url=url_to_text(the_url)
    the_name=tag_a.text.strip()

    return [{"type":fse_type,"url":the_url,"odir":the_odir,"name":the_name}]

def get_TagsFromBTag(tags_all,url_curr,outdir,atype):

# Get relevant tags from master tag
# Get the item tags from big tag

if atype=="apache2":
tag_table=tags_all.find("table")
Expand Down Expand Up @@ -106,21 +106,22 @@ def get_TagsFromBTag(tags_all,url_curr,outdir,atype):

for tag in iter(tags_target):
if atype=="apache2":
get_ItemFromTag_apache2(tag,outdir,results,path_origin)
results.extend(get_ItemFromTag_apache2(tag,outdir,path_origin))

if atype=="h5ai":
get_ItemFromTag_h5ai(tag,outdir,results)
results.extend(get_ItemFromTag_h5ai(tag,outdir))

return results

async def download_page(session,url):
print(f"\n- Obtaining tags from: {url}")
try:
async with session.get(url) as response:
async with session.get(url,verify_ssl=False) as response:
#if not response.headers.get("Content-Type")=="text/html":
# raise Exception("Expected text/html content")
html_dump=await response.text()
except Exception as e:
logging.exception(f"#error {url}")
print(f" Error: {e}")
return None

Expand All @@ -129,29 +130,42 @@ async def download_page(session,url):

async def download_file(session,url,filepath):
    """Stream a remote file to disk in 1 MiB chunks.

    Parameters:
        session: an aiohttp.ClientSession used for the GET request.
        url: the file's URL.
        filepath: pathlib.Path destination for the downloaded content.

    Behavior:
        - Skips (and logs as an error) files that already exist.
        - Ctrl+C while downloading skips the file: the partial download
          is deleted and the skip is logged as an error.
        - Any other failure is logged with a full traceback via
          logging.exception.
    """
    print(f"\n- Downloading file\n URL: {url}\n Filepath: {filepath}")
    mb=1024*1024
    filepath.parent.mkdir(parents=True,exist_ok=True)
    # unhandled: an unexpected exception already logged a traceback, so
    # the plain "#error" line is not written a second time.
    unhandled=False
    error=False
    try:
        if filepath.exists():
            raise FileExistsError
        # NOTE(review): verify_ssl=False disables certificate checks;
        # deliberate here to cope with self-signed autoindex hosts.
        async with session.get(url,verify_ssl=False) as response:
            async with aiofiles.open(f"{filepath}","wb") as file:
                while True:
                    chunk=await response.content.read(mb)
                    if not chunk:
                        break
                    await file.write(chunk)
    except FileExistsError:
        msg="The file already exists"
        error=True
    except KeyboardInterrupt:
        # User skipped this file: remove the partial download.
        if filepath.exists():
            filepath.unlink()
        msg="Skipped (and deleted)"
        error=True
    except Exception as e:
        msg=f"{e}"
        logging.exception(msg)
        unhandled=True
        error=True
    else:
        msg="Ok"

    if (not unhandled) and error:
        logging.error(f"#error {msg} ; {url}")

    print(f" {msg}")

async def processor(session,itemlist,yurl,atype):
item=itemlist.pop()
async def main_loop(session,item,yurl,atype):

print(f"\n- Processing the following item:\n {item}")

Expand All @@ -167,37 +181,39 @@ async def processor(session,itemlist,yurl,atype):

if item_type=="f":
await download_file(session,item_url,outpath)
return
return []

tags_all=await download_page(session,item_url)
if not tags_all:
return
return []

items_recovered=get_TagsFromBTag(tags_all,item_url,outpath,atype)
for item in items_recovered:
itemlist.append(item)
return get_TagsFromBTag(tags_all,item_url,outpath,atype)

async def manager(basedir_raw,atype,url_main):
#items_recovered=
#for item in items_recovered:
# itemlist.append(item)

async def main(basedir,atype,url_main):
yurl=yarl.URL(url_main)
session=aiohttp.ClientSession()
tags_all=await download_page(session,url_main)
if not tags_all:
return

basedir=Path(basedir_raw)
if basedir.exists():
if basedir.is_file():
print(f"\nERROR: The output path matches an existing file. Aborting now")
return
if atype in ("apache2","h5ai"):
root=(yurl.path=="/")
if not root:
firstdir=Path(yurl.path).name
if root:
firstdir=yurl.host

basedir.mkdir(parents=True,exist_ok=True)
outdir=basedir.joinpath(firstdir)

items=get_TagsFromBTag(tags_all,url_main,basedir,atype)
items=get_TagsFromBTag(tags_all,url_main,outdir,atype)
while True:
if len(items)==0:
break

await processor(session,items,yurl,atype)
items.extend(await main_loop(session,items.pop(),yurl,atype))

await session.close()

Expand All @@ -219,7 +235,7 @@ async def manager(basedir_raw,atype,url_main):
print("\nWritten by Carlos Alberto González Hernández\nVersion: 2023-05-27\n")
sys.exit(1)

bdir=sys.argv[1]
bdir_raw=sys.argv[1]
atype=sys.argv[2]
url=sys.argv[3]

Expand All @@ -234,10 +250,25 @@ async def manager(basedir_raw,atype,url_main):
sys.exit(1)

app_dir=Path(sys.argv[0]).parent
if app_dir.resolve()==Path(bdir).resolve():
if app_dir.resolve()==Path(bdir_raw).resolve():
print("\nERROR: Use a different directory")
sys.exit(1)

asyncio.run(manager(bdir,atype,url))
bdir=Path(bdir_raw)
if bdir.exists():
if bdir.is_file():
print(f"\nERROR: The output path matches an existing file")
sys.exit(1)

bdir.mkdir(parents=True,exist_ok=True)

log_name=f"{app_name}.txt"
log_path=str(bdir.joinpath(log_name))
logging.basicConfig(filename=str(log_path),format='[%(levelname) 5s/%(asctime)s] %(name)s %(funcName)s: %(msg)s',level=logging.INFO)
logging.info("# init")

asyncio.run(main(bdir,atype,url))

logging.info("# end")
print("\nProgram finished!")
sys.exit(0)

0 comments on commit 9169b0c

Please sign in to comment.