From 18b29c520509b2fa3b72eab052899a52e0e298fa Mon Sep 17 00:00:00 2001
From: Robert Pendell
Date: Thu, 24 Oct 2024 13:29:11 -0400
Subject: [PATCH 1/5] Add web archive as a final fallback

This implements the Internet Archive's Wayback Machine as a final
fallback for when a gallery is no longer available during a batch
download. Note that this cannot download removed galleries unless the
image metadata already exists in your local database (or you add that
data yourself).
---
 src/hentai.rs | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/hentai.rs b/src/hentai.rs
index 05ebec8..3cff66e 100644
--- a/src/hentai.rs
+++ b/src/hentai.rs
@@ -331,6 +331,7 @@ impl Hentai
 
         let mut r: reqwest::Response = http_client.get(image_url).send().await?; // tag search on general media server, page
+
         if r.status() != reqwest::StatusCode::OK // if status not ok: retry with other media servers
         {
             for media_server in MEDIA_SERVERS // try all media servers
             {
@@ -341,6 +342,15 @@ impl Hentai
                 if r.status() == reqwest::StatusCode::OK {break;} // if not ok: try again
             }
         }
+
+        if r.status() != reqwest::StatusCode::OK // finally try with archive.org
+        {
+            log::warn!("Pulling from the Internet Archive: {image_url}");
+            log::debug!("{}", image_url.replace("https://i.nhentai.net", "https://web.archive.org/web/00000000000000if_/https://i.nhentai.net"));
+            r = http_client.get(image_url.replace("https://i.nhentai.net", "https://web.archive.org/web/00000000000000if_/https://i.nhentai.net")).send().await?; // pull the snapshot from the Wayback Machine
+            log::debug!("{}", r.status());
+        }
+
         if r.status() != reqwest::StatusCode::OK {return Err(HentaiDownloadImageError::ReqwestStatus {url: image_url.to_owned(), status: r.status()});} // if status still not ok: something went wrong
 
         let mut file: tokio::fs::File;

From 14ab00b996b5ca78c76c79760cf83869b5d49736 Mon Sep 17 00:00:00 2001
From: Robert Pendell
Date: Thu, 24 Oct 2024 16:18:24 -0400
Subject: [PATCH 2/5] Wrap internet archive code in an option check

Add an optional ARCHIVE_ORG config setting to enable this
functionality. If the image does not exist on the Internet Archive you
will still get a 404; if the Archive itself is down you will get some
other error. The functionality is disabled by default.
---
 src/config.rs     |  2 ++
 src/hentai.rs     | 10 ++++++----
 src/main_inner.rs |  2 +-
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/config.rs b/src/config.rs
index 8147a17..2428102 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -19,6 +19,7 @@ pub struct Config
     pub NHENTAI_TAGS: Option<Vec<String>>, // keep creating downloadme.txt from these tags and keep downloading (server mode), normal tags are in format "tag:{tag}" for example "tag:ffm-threesome"; if None: don't generate downloadme.txt, download hentai once (client mode)
     pub SLEEP_INTERVAL: Option<u64>, // sleep interval in seconds between checking for new hentai to download (server mode)
     pub USER_AGENT: Option<String>, // bypass bot protection
+    pub ARCHIVE_ORG: Option<bool>, // Allow pull from archive.org? False by default
 }
 
 impl Default for Config
@@ -39,6 +40,7 @@ impl Default for Config
             NHENTAI_TAGS: None,
             SLEEP_INTERVAL: Some(50000),
             USER_AGENT: Some("".to_owned()),
+            ARCHIVE_ORG: None,
         }
     }
 }
\ No newline at end of file
diff --git a/src/hentai.rs b/src/hentai.rs
index 3cff66e..52aa4fb 100644
--- a/src/hentai.rs
+++ b/src/hentai.rs
@@ -130,10 +130,11 @@ impl Hentai
     /// # Arguments
     /// - `http_client`: reqwest http client
     /// - `db`: database connection
+    /// - `webarchive`: Download from web archive? False by default.
     ///
     /// # Returns
     /// - nothing or error
-    pub async fn download(&self, http_client: &reqwest::Client, cleanup_temporary_files: bool) -> Result<(), HentaiDownloadError>
+    pub async fn download(&self, http_client: &reqwest::Client, cleanup_temporary_files: bool, webarchive: bool) -> Result<(), HentaiDownloadError>
     {
         const WORKERS: usize = 5; // number of parallel workers
         let cbz_final_filepath: String; // filepath to final cbz in library
@@ -195,7 +196,7 @@ impl Hentai
             handles.push(tokio::spawn(async move
             {
                 let result: Option<()>;
-                match Self::download_image(&http_client_clone, &image_url_clone, &image_filepath).await // download image
+                match Self::download_image(&http_client_clone, &image_url_clone, &image_filepath, webarchive).await // download image
                 {
                     Ok(_) =>
                     {
@@ -315,10 +316,11 @@ impl Hentai
     /// - `http_client`: reqwest http client
     /// - `image_url`: url of the image to download
     /// - `image_filepath`: path to save the image to
+    /// - `webarchive`: Download from web archive? False by default.
     ///
     /// # Returns
     /// - nothing or error
-    async fn download_image(http_client: &reqwest::Client, image_url: &str, image_filepath: &str) -> Result<(), HentaiDownloadImageError>
+    async fn download_image(http_client: &reqwest::Client, image_url: &str, image_filepath: &str, webarchive: bool) -> Result<(), HentaiDownloadImageError>
     {
         const MEDIA_SERVERS: [u8; 4] = [2, 3, 5, 7]; // media servers to try if image not found, general first, after that explicit
 
@@ -343,7 +345,7 @@ impl Hentai
             }
         }
 
-        if r.status() != reqwest::StatusCode::OK // finally try with archive.org
+        if r.status() != reqwest::StatusCode::OK && webarchive // finally try with archive.org, but only if the user opted in; false by default
         {
             log::warn!("Pulling from the Internet Archive: {image_url}");
             log::debug!("{}", image_url.replace("https://i.nhentai.net", "https://web.archive.org/web/00000000000000if_/https://i.nhentai.net"));
diff --git a/src/main_inner.rs b/src/main_inner.rs
index 3c4ded5..dec1526 100644
--- a/src/main_inner.rs
+++ b/src/main_inner.rs
@@ -114,7 +114,7 @@ pub async fn main_inner(config: Config) -> Result<(), Error>
         }
     }
 
-    if let Err(e) = hentai.download(&http_client, config.CLEANUP_TEMPORARY_FILES.unwrap_or(true)).await
+    if let Err(e) = hentai.download(&http_client, config.CLEANUP_TEMPORARY_FILES.unwrap_or(true), config.ARCHIVE_ORG.unwrap_or(false)).await
     {
         log::error!{"{e}"};
     }

From a119977ad9e05eed001a15b06872d70a4ca2b04a Mon Sep 17 00:00:00 2001
From: Robert Pendell
Date: Sun, 27 Oct 2024 02:28:00 -0400
Subject: [PATCH 3/5] Relocate web archive pull

Moved the pull so it is no longer part of the main retry loop. I
duplicated the download image function, removing only the media server
loop and replacing the image download with the variant that rewrites
the URL and attempts to fetch the web archive version. The previous
version of the code tried to download from the archive 5 times (once
per attempt pass), which I didn't realize at first. This version only
tries once, and only after the 5 passes against the regular servers
have failed. Ideally I'd like to reduce the code duplication (or
eliminate it outright) and abort the archive attempt at the first
error, but at least it is better about it now.
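One way the duplication could later be collapsed (purely an
illustrative sketch, not what this patch does; get_first_ok and the
url-list shape are hypothetical):

    // hypothetical sketch: try candidate urls in order, return the first 200 response;
    // download_image and archive_image would then both reduce to building their candidate
    // url list and sharing a single save-to-disk function
    async fn get_first_ok(http_client: &reqwest::Client, urls: &[String]) -> Result<reqwest::Response, reqwest::Error>
    {
        let mut last: Option<reqwest::Response> = None;
        for url in urls
        {
            let r: reqwest::Response = http_client.get(url.as_str()).send().await?; // network errors abort immediately via ?
            if r.status() == reqwest::StatusCode::OK {return Ok(r);}
            last = Some(r); // remember the most recent failure for the caller's error report
        }
        Ok(last.expect("urls must not be empty")) // caller checks the status of the returned response
    }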
---
 src/hentai.rs | 99 ++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 95 insertions(+), 4 deletions(-)

diff --git a/src/hentai.rs b/src/hentai.rs
index 52aa4fb..2820c75 100644
--- a/src/hentai.rs
+++ b/src/hentai.rs
@@ -196,7 +196,7 @@ impl Hentai
             handles.push(tokio::spawn(async move
             {
                 let result: Option<()>;
-                match Self::download_image(&http_client_clone, &image_url_clone, &image_filepath, webarchive).await // download image
+                match Self::download_image(&http_client_clone, &image_url_clone, &image_filepath).await // download image
                 {
                     Ok(_) =>
                     {
@@ -219,6 +219,44 @@ impl Hentai
             }
             if image_download_success {break;} // if all images were downloaded successfully: continue with cbz creation
         }
+        if !image_download_success && webarchive { // web archive loop, only if the user opted in
+            image_download_success = true; // assume success
+            handles = Vec::new(); // reset handles
+
+            for i in 0..self.images_url.len() // for each page
+            {
+                let f_clone: scaler::Formatter = f.clone();
+                let http_client_clone: reqwest::Client = http_client.clone();
+                let image_filepath: String = format!("{}{}/{}", self.library_path, self.id, self.images_filename.get(i).expect("Index out of bounds even though should have same size as images_url."));
+                let image_url_clone: String = self.images_url.get(i).expect("Index out of bounds even though checked before that it fits.").clone();
+                let num_pages_clone: u16 = self.num_pages;
+
+                let permit: tokio::sync::OwnedSemaphorePermit = worker_sem.clone().acquire_owned().await.expect("Something closed semaphore even though it should never be closed."); // acquire semaphore
+                handles.push(tokio::spawn(async move
+                {
+                    let result: Option<()>;
+                    match Self::archive_image(&http_client_clone, &image_url_clone, &image_filepath).await // download image from the web archive
+                    {
+                        Ok(_) =>
+                        {
+                            log::debug!("Downloaded hentai image {} / {}.", f_clone.format((i+1) as f64), f_clone.format(num_pages_clone as f64));
+                            result = Some(()); // success
+                        }
+                        Err(e) =>
+                        {
+                            log::warn!("{e}");
+                            result = None; // failure
+                        }
+                    }
+                    drop(permit); // release semaphore
+                    result // return result into handle
+                })); // download all pages in parallel
+            }
+            for handle in handles
+            {
+                if handle.await.unwrap().is_none() {image_download_success = false;} // collect results, forward panics
+            }
+        }
         if !image_download_success {return Err(HentaiDownloadError::Download {})}; // if after 5 attempts still not all images downloaded successfully: give up
 
         log::info!("Downloaded hentai images.");
@@ -316,11 +354,10 @@ impl Hentai
     /// - `http_client`: reqwest http client
     /// - `image_url`: url of the image to download
     /// - `image_filepath`: path to save the image to
-    /// - `webarchive`: Download from web archive? False by default.
     ///
     /// # Returns
     /// - nothing or error
-    async fn download_image(http_client: &reqwest::Client, image_url: &str, image_filepath: &str, webarchive: bool) -> Result<(), HentaiDownloadImageError>
+    async fn download_image(http_client: &reqwest::Client, image_url: &str, image_filepath: &str) -> Result<(), HentaiDownloadImageError>
     {
         const MEDIA_SERVERS: [u8; 4] = [2, 3, 5, 7]; // media servers to try if image not found, general first, after that explicit
 
@@ -345,7 +382,61 @@ impl Hentai
             }
         }
 
-        if r.status() != reqwest::StatusCode::OK && webarchive // finally try with archive.org, but only if the user opted in; false by default
+        if r.status() != reqwest::StatusCode::OK {return Err(HentaiDownloadImageError::ReqwestStatus {url: image_url.to_owned(), status: r.status()});} // if status still not ok: something went wrong
+
+        let mut file: tokio::fs::File;
+        #[cfg(target_family = "unix")]
+        {
+            if let Some(parent) = std::path::Path::new(image_filepath).parent() // create all parent directories with permissions "drwxrwxrwx"
+            {
+                if let Err(e) = tokio::fs::DirBuilder::new().recursive(true).mode(0o777).create(parent).await
+                {
+                    return Err(HentaiDownloadImageError::StdIo {filepath: image_filepath.to_owned(), source: e});
+                }
+            }
+            match tokio::fs::OpenOptions::new().create_new(true).mode(0o666).write(true).open(image_filepath).await
+            {
+                Ok(o) => file = o,
+                Err(e) => {return Err(HentaiDownloadImageError::StdIo {filepath: image_filepath.to_owned(), source: e});}
+            }
+        }
+        #[cfg(not(target_family = "unix"))]
+        {
+            if let Some(parent) = std::path::Path::new(image_filepath).parent() // create all parent directories
+            {
+                if let Err(e) = tokio::fs::DirBuilder::new().recursive(true).create(parent).await
+                {
+                    return Err(HentaiDownloadImageError::StdIo {filepath: image_filepath.to_owned(), source: e});
+                }
+            }
+            match tokio::fs::OpenOptions::new().create_new(true).write(true).open(image_filepath).await
+            {
+                Ok(o) => file = o,
+                Err(e) => {return Err(HentaiDownloadImageError::StdIo {filepath: image_filepath.to_owned(), source: e});}
+            }
+        }
+
+        if let Err(e) = file.write_all_buf(&mut r.bytes().await?).await // save image with permissions "rw-rw-rw-"
+        {
+            return Err(HentaiDownloadImageError::StdIo {filepath: image_filepath.to_owned(), source: e});
+        }
+
+        return Ok(());
+    }
+
+    async fn archive_image(http_client: &reqwest::Client, image_url: &str, image_filepath: &str) -> Result<(), HentaiDownloadImageError>
+    {
+
+        if let Ok(o) = tokio::fs::metadata(image_filepath).await
+        {
+            if o.is_file() {return Ok(());} // if image already exists: skip download
+            if o.is_dir() {return Err(HentaiDownloadImageError::BlockedByDirectory {directory_path: image_filepath.to_owned()});} // if image filepath blocked by directory: give up
+        }
+
+
+        let mut r: reqwest::Response = http_client.get(image_url).send().await?; // try the original url once more
+
+        if r.status() != reqwest::StatusCode::OK // if status not ok: fall back to the Internet Archive
         {
             log::warn!("Pulling from the Internet Archive: {image_url}");
             log::debug!("{}", image_url.replace("https://i.nhentai.net", "https://web.archive.org/web/00000000000000if_/https://i.nhentai.net"));

From 9f524af025cc0194c6a981cc62838c1ba8dbb910 Mon Sep 17 00:00:00 2001
From: Robert Pendell
Date: Sun, 27 Oct 2024 11:29:14 -0400
Subject: [PATCH 4/5] Rename ARCHIVE_ORG to FALLBACK_TO_ARCHIVE_ORG
---
 src/config.rs     | 4 ++--
 src/main_inner.rs | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/config.rs b/src/config.rs
index 2428102..4245bed 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -14,12 +14,12 @@ pub struct Config
     pub DEBUG: Option<bool>, // debug mode?
     pub DONTDOWNLOADME_FILEPATH: Option<String>, // path to file containing hentai ID to not download, blacklist
     pub DOWNLOADME_FILEPATH: Option<String>, // path to file containing hentai ID to download
+    pub FALLBACK_TO_ARCHIVE_ORG: Option<bool>, // Allow pull from archive.org? False by default
     pub LIBRARY_PATH: String, // path to download hentai to
     pub LIBRARY_SPLIT: Option<u32>, // split library into subdirectories of maximum this many hentai, None or 0 to disable
     pub NHENTAI_TAGS: Option<Vec<String>>, // keep creating downloadme.txt from these tags and keep downloading (server mode), normal tags are in format "tag:{tag}" for example "tag:ffm-threesome"; if None: don't generate downloadme.txt, download hentai once (client mode)
     pub SLEEP_INTERVAL: Option<u64>, // sleep interval in seconds between checking for new hentai to download (server mode)
     pub USER_AGENT: Option<String>, // bypass bot protection
-    pub ARCHIVE_ORG: Option<bool>, // Allow pull from archive.org? False by default
 }
 
 impl Default for Config
@@ -40,7 +40,7 @@ impl Default for Config
             NHENTAI_TAGS: None,
             SLEEP_INTERVAL: Some(50000),
             USER_AGENT: Some("".to_owned()),
-            ARCHIVE_ORG: None,
+            FALLBACK_TO_ARCHIVE_ORG: None,
         }
     }
 }
\ No newline at end of file
diff --git a/src/main_inner.rs b/src/main_inner.rs
index dec1526..4a61cf8 100644
--- a/src/main_inner.rs
+++ b/src/main_inner.rs
@@ -114,7 +114,7 @@ pub async fn main_inner(config: Config) -> Result<(), Error>
         }
     }
 
-    if let Err(e) = hentai.download(&http_client, config.CLEANUP_TEMPORARY_FILES.unwrap_or(true), config.ARCHIVE_ORG.unwrap_or(false)).await
+    if let Err(e) = hentai.download(&http_client, config.CLEANUP_TEMPORARY_FILES.unwrap_or(true), config.FALLBACK_TO_ARCHIVE_ORG.unwrap_or(false)).await
     {
         log::error!{"{e}"};
     }

From 77a4f24778bce0a9e5bd6d95005bf362bf903d80 Mon Sep 17 00:00:00 2001
From: Robert Pendell
Date: Sun, 27 Oct 2024 11:30:17 -0400
Subject: [PATCH 5/5] Lowercase the FALLBACK_TO_ARCHIVE_ORG comment
---
 src/config.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/config.rs b/src/config.rs
index 4245bed..4bdcc91 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -14,7 +14,7 @@ pub struct Config
     pub DEBUG: Option<bool>, // debug mode?
     pub DONTDOWNLOADME_FILEPATH: Option<String>, // path to file containing hentai ID to not download, blacklist
     pub DOWNLOADME_FILEPATH: Option<String>, // path to file containing hentai ID to download
-    pub FALLBACK_TO_ARCHIVE_ORG: Option<bool>, // Allow pull from archive.org? False by default
+    pub FALLBACK_TO_ARCHIVE_ORG: Option<bool>, // allow pull from archive.org? false by default
     pub LIBRARY_PATH: String, // path to download hentai to
     pub LIBRARY_SPLIT: Option<u32>, // split library into subdirectories of maximum this many hentai, None or 0 to disable
     pub NHENTAI_TAGS: Option<Vec<String>>, // keep creating downloadme.txt from these tags and keep downloading (server mode), normal tags are in format "tag:{tag}" for example "tag:ffm-threesome"; if None: don't generate downloadme.txt, download hentai once (client mode)
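
For reference, the entire Wayback Machine fallback added in patches 1-3
boils down to one URL rewrite and a single retry. A minimal standalone
sketch of the pattern (the helper name wayback_fallback_get is
illustrative and not part of these patches; it assumes the same reqwest
client the crate already uses):

    // Sketch of the fallback pattern from patches 1-3 (illustrative, not the actual patch code).
    // Wayback Machine URLs have the form /web/{timestamp}{flag}/{original_url}: the all-zero
    // timestamp redirects to the nearest available snapshot (here: the earliest), and the
    // "if_" flag serves the raw file without the archive.org toolbar.
    async fn wayback_fallback_get(http_client: &reqwest::Client, image_url: &str) -> Result<reqwest::Response, reqwest::Error>
    {
        let r: reqwest::Response = http_client.get(image_url).send().await?; // try the live url first
        if r.status() == reqwest::StatusCode::OK {return Ok(r);}

        let archive_url: String = image_url.replace("https://i.nhentai.net", "https://web.archive.org/web/00000000000000if_/https://i.nhentai.net"); // rewrite to a wayback machine url
        http_client.get(archive_url).send().await // retry exactly once against the archive
    }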