
Commit

fixed error 404 from tag search page 1 resulting in unsuccessful tag search
9FS committed Sep 19, 2024
1 parent 1d3e614 commit 2bca8e0
Showing 6 changed files with 40 additions and 47 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion Cargo.toml
@@ -9,7 +9,7 @@ license = "MIT"
name = "nhentai_archivist"
readme = "readme.md"
repository = "https://github.com/9-FS/nhentai_archivist"
version = "3.3.1"
version = "3.3.2"

[dependencies]
chrono = { version = "^0.4.0", features = ["serde"] }
2 changes: 1 addition & 1 deletion docker-compose.yaml
@@ -3,7 +3,7 @@ version: "3"
services:
nhentai_archivist:
container_name: "nhentai_archivist"
image: "ghcr.io/9-fs/nhentai_archivist:3.3.1"
image: "ghcr.io/9-fs/nhentai_archivist:3.3.2"
environment:
HOST_OS: "Unraid"
PGID: 100
3 changes: 1 addition & 2 deletions readme.md
@@ -168,8 +168,7 @@ nHentai Archivist is not connected to your nHentai account in any way. Automatic
## Known Issues

- Searching by tags / downloading metadata often results in error 404 on seemingly random pages. This behaviour is consistent even when the URL is opened by a browser, so I assume the problem to be on nHentai's side.\
If error 404 occurs on page 1, you're out of luck as that page is required to get the total number of pages. In that case you will have to wait until the error disappears again or temporarily change the search query.\
In any other case just ignore the warnings and let nHentai Archivist search and download multiple times to get everything reliably, ideally with a `SLEEP_INTERVAL` of at least 50.000 so searches are guaranteed to be far enough apart. After a few runs, you will notice all but the newest hentai being skipped during the download phase. That's when you know you got everything. See [issue #3](https://github.com/9-FS/nhentai_archivist/issues/3).
Just ignore the warnings and let nHentai Archivist search and download multiple times to get everything reliably, ideally with a `SLEEP_INTERVAL` of at least 50.000 so searches are guaranteed to be far enough apart. After a few runs, you will notice all but the newest hentai being skipped during the download phase. That's when you know you got everything. See [issue #3](https://github.com/9-FS/nhentai_archivist/issues/3).

- nHentai contains a lot of duplicates. There is currently no way to filter them out. See [issue #6](https://github.com/9-FS/nhentai_archivist/issues/6).

26 changes: 6 additions & 20 deletions src/error.rs
@@ -88,45 +88,31 @@ pub enum SearchByIdError
}


#[derive(Debug, thiserror::Error)]
pub enum SearchByTagError
{
#[error("Downloading hentai metadata page 1 from \"{}\" failed with: {0}", .0.url().map_or_else(|| "<unknown>", |o| o.as_str()))]
Reqwest(#[from] reqwest::Error),

#[error("Downloading hentai metadata page 1 from \"{url}\" failed with status code {status}.")]
ReqwestStatus {url: String, status: reqwest::StatusCode},

#[error("Saving hentai metadata page 1 in database failed with: {0}")]
SerdeJson(#[from] serde_json::Error),
}


#[derive(Debug, thiserror::Error)]
pub enum SearchByTagOnPageError
{
#[error
(
"Downloading hentai metadata page {} / {} from \"{}\" failed with: {source}",
scaler::Formatter::new().set_scaling(scaler::Scaling::None).set_rounding(scaler::Rounding::Magnitude(0)).format(*page_no),
scaler::Formatter::new().set_scaling(scaler::Scaling::None).set_rounding(scaler::Rounding::Magnitude(0)).format(*num_pages),
num_pages.map_or("<unknown>".to_owned(), |o| scaler::Formatter::new().set_scaling(scaler::Scaling::None).set_rounding(scaler::Rounding::Magnitude(0)).format(o)),
source.url().map_or_else(|| "<unknown>", |o| o.as_str())
)]
Reqwest {page_no: u32, num_pages: u32, source: reqwest::Error},
Reqwest {page_no: u32, num_pages: Option<u32>, source: reqwest::Error},

#[error
(
"Downloading hentai metadata page {} / {} from \"{url}\" failed with status code {status}.",
scaler::Formatter::new().set_scaling(scaler::Scaling::None).set_rounding(scaler::Rounding::Magnitude(0)).format(*page_no),
scaler::Formatter::new().set_scaling(scaler::Scaling::None).set_rounding(scaler::Rounding::Magnitude(0)).format(*num_pages),
num_pages.map_or("<unknown>".to_owned(), |o| scaler::Formatter::new().set_scaling(scaler::Scaling::None).set_rounding(scaler::Rounding::Magnitude(0)).format(o)),
)]
ReqwestStatus {page_no: u32, num_pages: u32, url: String, status: reqwest::StatusCode},
ReqwestStatus {page_no: u32, num_pages: Option<u32>, url: String, status: reqwest::StatusCode},

#[error
(
"Saving hentai metadata page {} / {} in database failed with: {source}",
scaler::Formatter::new().set_scaling(scaler::Scaling::None).set_rounding(scaler::Rounding::Magnitude(0)).format(*page_no),
scaler::Formatter::new().set_scaling(scaler::Scaling::None).set_rounding(scaler::Rounding::Magnitude(0)).format(*num_pages),
num_pages.map_or("<unknown>".to_owned(), |o| scaler::Formatter::new().set_scaling(scaler::Scaling::None).set_rounding(scaler::Rounding::Magnitude(0)).format(o)),
)]
SerdeJson {page_no: u32, num_pages: u32, source: serde_json::Error},
SerdeJson {page_no: u32, num_pages: Option<u32>, source: serde_json::Error},
}
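
The change that matters in this file is `num_pages` becoming an `Option<u32>`, so the error variants can be built even when the total page count was never obtained (e.g. when page 1 itself returns 404). Below is a minimal sketch of the resulting fallback formatting, not part of the commit, using plain `to_string` instead of the `scaler::Formatter` chain from the real code:

```rust
// Sketch only: mirrors the `map_or` fallback used in the diff above.
fn format_page_count(num_pages: Option<u32>) -> String {
    num_pages.map_or("<unknown>".to_owned(), |n| n.to_string())
}

fn main() {
    assert_eq!(format_page_count(Some(37)), "37");
    assert_eq!(format_page_count(None), "<unknown>"); // total page count not yet known
}
```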
52 changes: 30 additions & 22 deletions src/search_api.rs
@@ -54,37 +54,42 @@ pub async fn search_by_id(http_client: &reqwest::Client, nhentai_hentai_search_u
///
/// # Returns
/// - list of hentai ID to download or error
pub async fn search_by_tag(http_client: &reqwest::Client, nhentai_tag_search_url: &str, nhentai_tags: &Vec<String>, db: &sqlx::sqlite::SqlitePool) -> Result<Vec<u32>, SearchByTagError>
pub async fn search_by_tag(http_client: &reqwest::Client, nhentai_tag_search_url: &str, nhentai_tags: &Vec<String>, db: &sqlx::sqlite::SqlitePool) -> Result<Vec<u32>, SearchByTagOnPageError>
{
const WORKERS: usize = 2; // number of concurrent workers
let f = scaler::Formatter::new()
.set_scaling(scaler::Scaling::None)
.set_rounding(scaler::Rounding::Magnitude(0)); // formatter
let mut handles: Vec<tokio::task::JoinHandle<Option<Vec<u32>>>> = Vec::new(); // list of handles to tag_search_page
let mut hentai_id_list: Vec<u32> = Vec::new(); // list of hentai id to download
let r_serialised: TagSearchResponse; // response in json format
let mut num_pages: Option<u32> = None; // number of search result pages, at the beginning unknown
let worker_sem: std::sync::Arc<tokio::sync::Semaphore> = std::sync::Arc::new(tokio::sync::Semaphore::new(WORKERS)); // limit number of concurrent workers otherwise api enforces rate limit


let mut page_no: u32 = 1;
while page_no <= 10 // search first pages sequentially to try to get total number of pages
{
let r: reqwest::Response = http_client.get(format!("{nhentai_tag_search_url}?query={}&page=1", nhentai_tags.join("+"))).send().await?; // tag search, page, do not use .query() because it converts "+" between multiple tags to "%2B"
log::debug!("{}", r.url());
if r.status() != reqwest::StatusCode::OK {return Err(SearchByTagError::ReqwestStatus {url: r.url().to_string(), status: r.status()});} // if status is not ok: something went wrong
r_serialised = serde_json::from_str(r.text().await?.as_str())?; // deserialise json, get this response here to get number of pages before starting parallel workers
if let Err(e) = r_serialised.write_to_db(db).await // save data to database, if unsuccessful: warning
match search_by_tag_on_page(http_client.clone(), nhentai_tag_search_url.to_owned(), nhentai_tags.clone(), page_no, num_pages, db.clone()).await
{
log::warn!("Saving hentai metadata page 1 / {} in database failed with: {e}", f.format(r_serialised.num_pages));
Ok(o) =>
{
log::info!("Downloaded hentai metadata page {} / {}.", f.format(page_no), f.format(o.0));
num_pages = Some(o.0); // set number of pages
hentai_id_list.extend(o.1);
page_no += 1;
break; // initiate parallel search
}
Err(e) =>
{
if page_no < 10 {log::warn!("{e}");} // if not last page: only log error, retry with next page
else {return Err(e);} // if last page and still error: return error
}
}
log::info!("Downloaded hentai metadata page 1 / {}.", f.format(r_serialised.num_pages));
}

for hentai in r_serialised.result // collect hentai id
{
hentai_id_list.push(hentai.id);
page_no += 1;
}


for page_no in 2..=r_serialised.num_pages // for each page, search in parallel
for page_no in page_no..=num_pages.expect("num_pages is None even though made sure it should be initialised.") // continue with parallel search
{
let db_clone: sqlx::Pool<sqlx::Sqlite> = db.clone();
let f_clone: scaler::Formatter = f.clone();
@@ -96,11 +101,11 @@ pub async fn search_by_tag(http_client: &reqwest::Client, nhentai_tag_search_url
handles.push(tokio::spawn(async move
{
let result: Option<Vec<u32>>;
match search_by_tag_on_page(http_client_clone, nhentai_tag_search_url_clone, nhentai_tags_clone, page_no, r_serialised.num_pages, db_clone).await
match search_by_tag_on_page(http_client_clone, nhentai_tag_search_url_clone, nhentai_tags_clone, page_no, num_pages, db_clone).await
{
Ok(o) =>
Ok((_, o)) =>
{
log::info!("Downloaded hentai metadata page {} / {}.", f_clone.format(page_no), f_clone.format(r_serialised.num_pages));
log::info!("Downloaded hentai metadata page {} / {}.", f_clone.format(page_no), f_clone.format(num_pages.expect("num_pages is None even though made sure it should be initialised.")));
result = Some(o);
}
Err(e) =>
@@ -131,11 +136,14 @@ pub async fn search_by_tag(http_client: &reqwest::Client, nhentai_tag_search_url
/// - `nhentai_tag_search_url`: nhentai.net tag search api url
/// - `nhentai_tags`: tags to search for
/// - `page_no`: page number
/// - `num_pages`: number of search result pages, if already known
/// - `db`: database connection
///
/// # Returns
/// - list of hentai ID to download or error
async fn search_by_tag_on_page(http_client: reqwest::Client, nhentai_tag_search_url: String, nhentai_tags: Vec<String>, page_no: u32, num_pages: u32, db: sqlx::sqlite::SqlitePool) -> Result<Vec<u32>, SearchByTagOnPageError>
/// - number of search result pages
/// - list of hentai ID to download
/// - or error
async fn search_by_tag_on_page(http_client: reqwest::Client, nhentai_tag_search_url: String, nhentai_tags: Vec<String>, page_no: u32, num_pages: Option<u32>, db: sqlx::sqlite::SqlitePool) -> Result<(u32, Vec<u32>), SearchByTagOnPageError>
{
let f = scaler::Formatter::new()
.set_scaling(scaler::Scaling::None)
@@ -174,13 +182,13 @@ async fn search_by_tag_on_page(http_client: reqwest::Client, nhentai_tag_search_
}
if let Err(e) = r_serialised.write_to_db(&db).await // save data to database
{
log::warn!("Saving hentai metadata page {} / {} in database failed with: {e}", f.format(page_no), f.format(num_pages));
log::warn!("Saving hentai metadata page {} / {} in database failed with: {e}", f.format(page_no), num_pages.map_or("<unknown>".to_owned(), |o| f.format(o)));
}

for hentai in r_serialised.result // collect hentai id
{
hentai_id_list.push(hentai.id);
}

return Ok(hentai_id_list);
return Ok((r_serialised.num_pages, hentai_id_list));
}
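
Taken together, the new `search_by_tag` probes the first pages sequentially until one of them succeeds (any successful page reports the total page count), and only then fans the remaining pages out to a small, semaphore-limited pool of workers. Below is a simplified, self-contained sketch of that pattern; `fetch_page` is a hypothetical stand-in for `search_by_tag_on_page`, errors are reduced to strings, and the data is dummy, so it illustrates the control flow rather than the project's actual API:

```rust
// Assumes tokio = { version = "1", features = ["full"] } in Cargo.toml.
use std::sync::Arc;
use tokio::sync::Semaphore;

// Hypothetical stand-in for search_by_tag_on_page: returns (num_pages, hentai ids) or an error.
async fn fetch_page(page_no: u32) -> Result<(u32, Vec<u32>), String> {
    Ok((42, vec![page_no])) // dummy data for illustration
}

#[tokio::main]
async fn main() -> Result<(), String> {
    const WORKERS: usize = 2; // mirrors the concurrency limit in the diff
    let mut ids: Vec<u32> = Vec::new();
    let mut num_pages: Option<u32> = None;

    // Sequential probe: tolerate failures on the first few pages, because any
    // successful page reveals the total number of pages.
    let mut page_no: u32 = 1;
    while page_no <= 10 {
        match fetch_page(page_no).await {
            Ok((total, page_ids)) => {
                num_pages = Some(total);
                ids.extend(page_ids);
                page_no += 1;
                break; // switch to the parallel phase
            }
            Err(e) if page_no < 10 => {
                eprintln!("page {page_no} failed: {e}, retrying with the next page");
                page_no += 1;
            }
            Err(e) => return Err(e), // even page 10 failed: give up
        }
    }

    // Parallel phase: remaining pages, at most WORKERS requests in flight.
    let sem = Arc::new(Semaphore::new(WORKERS));
    let mut handles = Vec::new();
    for p in page_no..=num_pages.expect("set by the probe loop before breaking") {
        let permit = sem.clone().acquire_owned().await.expect("semaphore closed");
        handles.push(tokio::spawn(async move {
            let _permit = permit; // released when the task finishes
            fetch_page(p).await.ok().map(|(_, page_ids)| page_ids)
        }));
    }
    for handle in handles {
        if let Some(page_ids) = handle.await.expect("worker task panicked") {
            ids.extend(page_ids);
        }
    }

    println!("collected {} hentai ids", ids.len());
    Ok(())
}
```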
