Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
carlos-a-g-h authored May 30, 2023
1 parent 2d58b4b commit 9169b0c
Show file tree
Hide file tree
Showing 2 changed files with 83 additions and 45 deletions.
11 changes: 9 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,22 @@ ABUSIV is a recursive downloader for websites that do static file serving with a

- The **BaseDir** argument is the local directory where all the files/folders will be downloaded. The base directory cannot match the program's directory
- The **AType** argument is the type of autoindex that is used by the website
- The **URL** argument is the starting URL, this URL cannot be a direct link from a file, it has to be a directory that leads to other files/directories
- The **URL** argument is the starting URL, this URL cannot be a direct link from a file, it has to be a directory that leads to other files and/or directories

## Available Autoindex types

- h5ai
- apache2
- h5ai

## Changelog

### 2023-05-29

- Made some rewrites (said bye bye to some mutable shared states) and changed some names
- Added some logging: The log file is saved in the base directory and any download failures will be logged as errors
- The output directory for all the downloaded content will be the initial directory (the initial directory name is taken from the initial URL) inside the base directory instead of the base directory itself
- While downloading a file or a page, you can hit Ctrl+C to skip it. In the case of skipping a file, the partially downloaded file will be eliminated and logged as an error

### 2023-05-27

- Bugfixes and added some guardrails
Expand Down
117 changes: 74 additions & 43 deletions abusiv.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,32 @@
import aiofiles
import aiohttp
import datetime
import logging
import yarl

from bs4 import BeautifulSoup
from pathlib import Path
from urllib.parse import unquote as url_to_text

def get_ItemFromTag_apache2(tag,the_odir,reslist,path_origin):
def get_ItemFromTag_apache2(tag,the_odir,path_origin):
tag_td_icon=tag.find("td",attrs={"valign":"top"})
tag_a=tag.find("a")
if (not tag_td_icon) or (not tag_a):
return
return []

tag_img=tag_td_icon.find("img")
if not tag_img:
return
return []

the_type_raw=tag_img.get("alt")
if (not the_type_raw):
return
return []

if the_type_raw=="[DIR]":
fse_type="d"

if not the_type_raw=="[DIR]":
if the_type_raw=="[PARENTDIR]":
return
return []

fse_type="f"

Expand All @@ -37,38 +37,38 @@ def get_ItemFromTag_apache2(tag,the_odir,reslist,path_origin):

the_name=tag_a.text.strip()

reslist.append({"type":fse_type,"url":the_url,"odir":the_odir,"name":the_name})
return [{"type":fse_type,"url":the_url,"odir":the_odir,"name":the_name}]

def get_ItemFromTag_h5ai(tag,the_odir):
    """Extract a single file/directory item from one h5ai listing row.

    Parameters:
        tag: a BeautifulSoup Tag for one <tr> row of the h5ai file table.
        the_odir: output directory associated with this item; stored
            untouched in the resulting item dict.

    Returns:
        A list with exactly one item dict
        {"type": "f"|"d", "url": ..., "odir": ..., "name": ...}
        on success, or an empty list when the row is not a valid
        file/folder entry (so callers can extend() the result directly).
    """
    # h5ai rows carry the icon in a td.fb-i cell and the link in td.fb-n.
    tag_td_icon=tag.find("td",attrs={"class":"fb-i"})
    tag_td_link=tag.find("td",attrs={"class":"fb-n"})
    if (not tag_td_icon) or (not tag_td_link):
        return []

    tag_img=tag_td_icon.find("img")
    if not tag_img:
        return []

    # The icon's alt text tells files and folders apart; anything else
    # (header rows, parent-dir links) is skipped.
    the_type_raw=tag_img.get("alt")
    if (not the_type_raw) or (not the_type_raw in ["file","folder"]):
        return []

    fse_type={"file":"f","folder":"d"}[the_type_raw]

    tag_a=tag_td_link.find("a")
    if not tag_a:
        return []

    # hrefs are percent-encoded; decode to a human-readable URL/path.
    the_url=tag_a.get("href")
    the_url=url_to_text(the_url)
    the_name=tag_a.text.strip()

    return [{"type":fse_type,"url":the_url,"odir":the_odir,"name":the_name}]

def get_TagsFromBTag(tags_all,url_curr,outdir,atype):

# Get relevant tags from master tag
# Get the item tags from big tag

if atype=="apache2":
tag_table=tags_all.find("table")
Expand Down Expand Up @@ -106,21 +106,22 @@ def get_TagsFromBTag(tags_all,url_curr,outdir,atype):

for tag in iter(tags_target):
if atype=="apache2":
get_ItemFromTag_apache2(tag,outdir,results,path_origin)
results.extend(get_ItemFromTag_apache2(tag,outdir,path_origin))

if atype=="h5ai":
get_ItemFromTag_h5ai(tag,outdir,results)
results.extend(get_ItemFromTag_h5ai(tag,outdir))

return results

async def download_page(session,url):
print(f"\n- Obtaining tags from: {url}")
try:
async with session.get(url) as response:
async with session.get(url,verify_ssl=False) as response:
#if not response.headers.get("Content-Type")=="text/html":
# raise Exception("Expected text/html content")
html_dump=await response.text()
except Exception as e:
logging.exception(f"#error {url}")
print(f" Error: {e}")
return None

Expand All @@ -129,29 +130,42 @@ async def download_page(session,url):

async def download_file(session,url,filepath):
    """Stream a remote file to disk in 1 MiB chunks.

    Parameters:
        session: an aiohttp.ClientSession used for the GET request.
        url: the file's URL.
        filepath: pathlib.Path destination for the downloaded content.

    Behavior:
        - Skips (and logs as an error) files that already exist.
        - Ctrl+C while downloading skips the file: the partial download
          is deleted and the skip is logged as an error.
        - Any other failure is logged with a full traceback via
          logging.exception.
    """
    print(f"\n- Downloading file\n URL: {url}\n Filepath: {filepath}")
    mb=1024*1024
    filepath.parent.mkdir(parents=True,exist_ok=True)
    # unhandled: an unexpected exception already logged a traceback, so
    # the plain "#error" line is not written a second time.
    unhandled=False
    error=False
    try:
        if filepath.exists():
            raise FileExistsError
        # NOTE(review): verify_ssl=False disables certificate checks;
        # deliberate here to cope with self-signed autoindex hosts.
        async with session.get(url,verify_ssl=False) as response:
            async with aiofiles.open(f"{filepath}","wb") as file:
                while True:
                    chunk=await response.content.read(mb)
                    if not chunk:
                        break
                    await file.write(chunk)
    except FileExistsError:
        msg="The file already exists"
        error=True
    except KeyboardInterrupt:
        # User skipped this file: remove the partial download.
        if filepath.exists():
            filepath.unlink()
        msg="Skipped (and deleted)"
        error=True
    except Exception as e:
        msg=f"{e}"
        logging.exception(msg)
        unhandled=True
        error=True
    else:
        msg="Ok"

    if (not unhandled) and error:
        logging.error(f"#error {msg} ; {url}")

    print(f" {msg}")

async def processor(session,itemlist,yurl,atype):
item=itemlist.pop()
async def main_loop(session,item,yurl,atype):

print(f"\n- Processing the following item:\n {item}")

Expand All @@ -167,37 +181,39 @@ async def processor(session,itemlist,yurl,atype):

if item_type=="f":
await download_file(session,item_url,outpath)
return
return []

tags_all=await download_page(session,item_url)
if not tags_all:
return
return []

items_recovered=get_TagsFromBTag(tags_all,item_url,outpath,atype)
for item in items_recovered:
itemlist.append(item)
return get_TagsFromBTag(tags_all,item_url,outpath,atype)

async def manager(basedir_raw,atype,url_main):
#items_recovered=
#for item in items_recovered:
# itemlist.append(item)

async def main(basedir,atype,url_main):
yurl=yarl.URL(url_main)
session=aiohttp.ClientSession()
tags_all=await download_page(session,url_main)
if not tags_all:
return

basedir=Path(basedir_raw)
if basedir.exists():
if basedir.is_file():
print(f"\nERROR: The output path matches an existing file. Aborting now")
return
if atype in ("apache2","h5ai"):
root=(yurl.path=="/")
if not root:
firstdir=Path(yurl.path).name
if root:
firstdir=yurl.host

basedir.mkdir(parents=True,exist_ok=True)
outdir=basedir.joinpath(firstdir)

items=get_TagsFromBTag(tags_all,url_main,basedir,atype)
items=get_TagsFromBTag(tags_all,url_main,outdir,atype)
while True:
if len(items)==0:
break

await processor(session,items,yurl,atype)
items.extend(await main_loop(session,items.pop(),yurl,atype))

await session.close()

Expand All @@ -219,7 +235,7 @@ async def manager(basedir_raw,atype,url_main):
print("\nWritten by Carlos Alberto González Hernández\nVersion: 2023-05-27\n")
sys.exit(1)

bdir=sys.argv[1]
bdir_raw=sys.argv[1]
atype=sys.argv[2]
url=sys.argv[3]

Expand All @@ -234,10 +250,25 @@ async def manager(basedir_raw,atype,url_main):
sys.exit(1)

app_dir=Path(sys.argv[0]).parent
if app_dir.resolve()==Path(bdir).resolve():
if app_dir.resolve()==Path(bdir_raw).resolve():
print("\nERROR: Use a different directory")
sys.exit(1)

asyncio.run(manager(bdir,atype,url))
bdir=Path(bdir_raw)
if bdir.exists():
if bdir.is_file():
print(f"\nERROR: The output path matches an existing file")
sys.exit(1)

bdir.mkdir(parents=True,exist_ok=True)

log_name=f"{app_name}.txt"
log_path=str(bdir.joinpath(log_name))
logging.basicConfig(filename=str(log_path),format='[%(levelname) 5s/%(asctime)s] %(name)s %(funcName)s: %(msg)s',level=logging.INFO)
logging.info("# init")

asyncio.run(main(bdir,atype,url))

logging.info("# end")
print("\nProgram finished!")
sys.exit(0)

0 comments on commit 9169b0c

Please sign in to comment.