From bf72bac2fa74547f8b840f2b45a9adeea6735072 Mon Sep 17 00:00:00 2001 From: j-mendez Date: Mon, 10 Feb 2025 10:37:01 -0500 Subject: [PATCH] chore(website): add direct proxy control --- Cargo.lock | 12 +++--- spider/Cargo.toml | 2 +- spider/src/configuration.rs | 45 +++++++++++++++++--- spider/src/features/chrome.rs | 25 ++++------- spider/src/website.rs | 70 +++++++------------------------ spider_chrome/Cargo.toml | 2 +- spider_cli/Cargo.toml | 2 +- spider_transformations/Cargo.toml | 2 +- spider_utils/Cargo.toml | 2 +- spider_worker/Cargo.toml | 2 +- 10 files changed, 74 insertions(+), 90 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9f98200c2..e40d375f9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5588,7 +5588,7 @@ dependencies = [ [[package]] name = "spider" -version = "2.27.50" +version = "2.27.51" dependencies = [ "ahash", "aho-corasick", @@ -5653,7 +5653,7 @@ dependencies = [ [[package]] name = "spider_chrome" -version = "2.27.50" +version = "2.27.51" dependencies = [ "adblock", "aho-corasick", @@ -5744,7 +5744,7 @@ dependencies = [ [[package]] name = "spider_cli" -version = "2.27.50" +version = "2.27.51" dependencies = [ "clap", "env_logger", @@ -5787,7 +5787,7 @@ dependencies = [ [[package]] name = "spider_transformations" -version = "2.27.50" +version = "2.27.51" dependencies = [ "aho-corasick", "fast_html2md", @@ -5810,7 +5810,7 @@ dependencies = [ [[package]] name = "spider_utils" -version = "2.27.50" +version = "2.27.51" dependencies = [ "hashbrown 0.15.2", "indexmap 1.9.3", @@ -5827,7 +5827,7 @@ dependencies = [ [[package]] name = "spider_worker" -version = "2.27.50" +version = "2.27.51" dependencies = [ "env_logger", "lazy_static", diff --git a/spider/Cargo.toml b/spider/Cargo.toml index 5024239d5..b42baca3b 100644 --- a/spider/Cargo.toml +++ b/spider/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider" -version = "2.27.50" +version = "2.27.51" authors = [ "j-mendez " ] diff --git a/spider/src/configuration.rs b/spider/src/configuration.rs 
index daa01d243..b4dda9f41 100644 --- a/spider/src/configuration.rs +++ b/spider/src/configuration.rs @@ -44,6 +44,29 @@ pub type AllowList = Box; #[cfg_attr(not(feature = "regex"), derive(PartialEq, Eq))] pub struct AllowListSet(pub AllowList); +/// Determine allow proxy +#[derive(Debug, Default, Clone, PartialEq)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +pub enum ProxyIgnore { + /// Chrome proxy. + Chrome, + /// HTTP proxy. + Http, + #[default] + /// Do not ignore + No, +} + +/// The networking proxy to use. +#[derive(Debug, Default, Clone, PartialEq)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] +pub struct RequestProxy { + /// The proxy address. + pub addr: String, + /// Ignore the proxy when running a request type. + pub ignore: ProxyIgnore, +} + /// Structure to configure `Website` crawler /// ```rust /// use spider::website::Website; @@ -85,7 +108,7 @@ pub struct Configuration { /// Use HTTP2 for connection. Enable if you know the website has http2 support. pub http2_prior_knowledge: bool, /// Use proxy list for performing network request. - pub proxies: Option>>, + pub proxies: Option>>, /// Headers to include with request. pub headers: Option>, #[cfg(feature = "sitemap")] @@ -564,10 +587,22 @@ impl Configuration { /// Use proxies for request. pub fn with_proxies(&mut self, proxies: Option>) -> &mut Self { - match proxies { - Some(p) => self.proxies = Some(p.into()), - _ => self.proxies = None, - }; + self.proxies = proxies.map(|p| { + Box::new( + p.iter() + .map(|addr| RequestProxy { + addr: addr.to_owned(), + ..Default::default() + }) + .collect::>(), + ) + }); + self + } + + /// Use proxies for request with control between chrome and http. 
+ pub fn with_proxies_direct(&mut self, proxies: Option>>) -> &mut Self { + self.proxies = proxies; self } diff --git a/spider/src/features/chrome.rs b/spider/src/features/chrome.rs index e706f0232..9a446a4d1 100644 --- a/spider/src/features/chrome.rs +++ b/spider/src/features/chrome.rs @@ -351,7 +351,13 @@ pub async fn launch_browser( if let Some(ref proxies) = config.proxies { let use_plain_http = proxies.len() >= 2; - for p in proxies.iter() { + for proxie in proxies.iter() { + if proxie.ignore == ProxyIgnore::Chrome { + continue; + } + + let p = &proxie.addr; + if !p.is_empty() { // pick the socks:// proxy over http if found. if p.starts_with("socks://") { @@ -367,23 +373,6 @@ create_content.proxy_bypass_list = Some("<-loopback>".into()); } - if p.starts_with("force_chrome_http://") { - create_content.proxy_server = - Some(p.replacen("force_chrome_http://", "http://", 1).into()); - break; - } - if p.starts_with("force_chrome_https://") { - create_content.proxy_server = - Some(p.replacen("force_chrome_https://", "https://", 1).into()); - break; - } - if p.starts_with("force_chrome_socks5://") { - create_content.proxy_server = Some( - p.replacen("force_chrome_socks5://", "socks5://", 1).into(), - ); - break; - } - create_content.proxy_server = Some(p.into()); } } diff --git a/spider/src/website.rs b/spider/src/website.rs index a85050734..f693f905d 100644 --- a/spider/src/website.rs +++ b/spider/src/website.rs @@ -1146,6 +1146,8 @@ impl Website { fn configure_http_client_builder(&mut self) -> crate::ClientBuilder { use reqwest::header::HeaderMap; + use crate::configuration::ProxyIgnore; + let policy = self.setup_redirect_policy(); let mut headers: HeaderMap = HeaderMap::new(); @@ -1189,35 +1191,12 @@ impl Website { let replace_plain_socks = proxies.len() == 1 && linux; for proxie in proxies.iter() { - if proxie.starts_with("force_req_http://") { - if let Ok(proxy) = - reqwest::Proxy::all(&proxie.replacen("force_req_http://", "http://", 
1)) - { - client = client.proxy(proxy); - } - break; - } - if proxie.starts_with("force_req_https://") { - if let Ok(proxy) = reqwest::Proxy::all(&proxie.replacen( - "force_req_https://", - "https://", - 1, - )) { - client = client.proxy(proxy); - } - break; - } - if proxie.starts_with("force_req_socks5://") { - if let Ok(proxy) = reqwest::Proxy::all(&proxie.replacen( - "force_req_socks5://", - "socks5://", - 1, - )) { - client = client.proxy(proxy); - } - break; + if proxie.ignore == ProxyIgnore::Http { + continue; } + let proxie = &proxie.addr; + let socks = proxie.starts_with("socks://"); // we can skip it and use another proxy from the list. @@ -1309,35 +1288,10 @@ impl Website { let replace_plain_socks = proxies.len() == 1 && linux; for proxie in proxies.iter() { - // special force proxy conditions. We should map the configs instead later. - if proxie.starts_with("force_req_http://") { - if let Ok(proxy) = - reqwest::Proxy::all(&proxie.replacen("force_req_http://", "http://", 1)) - { - client = client.proxy(proxy); - } - break; - } - if proxie.starts_with("force_req_https://") { - if let Ok(proxy) = reqwest::Proxy::all(&proxie.replacen( - "force_req_https://", - "https://", - 1, - )) { - client = client.proxy(proxy); - } - break; - } - if proxie.starts_with("force_req_socks5://") { - if let Ok(proxy) = reqwest::Proxy::all(&proxie.replacen( - "force_req_socks5://", - "socks5://", - 1, - )) { - client = client.proxy(proxy); - } - break; + if proxie.ignore == ProxyIgnore::Http { + continue; } + let proxie = &proxie.addr; let socks = proxie.starts_with("socks://"); @@ -4613,6 +4567,12 @@ impl Website { self } + /// Use proxies for request with control between chrome and http. + pub fn with_proxies_direct(&mut self, proxies: Option>>) -> &mut Self { + self.configuration.with_proxies_direct(proxies); + self + } + /// Set the concurrency limits. If you set the value to None to use the default limits using the system CPU cors * n. 
pub fn with_concurrency_limit(&mut self, limit: Option) -> &mut Self { self.configuration.with_concurrency_limit(limit); diff --git a/spider_chrome/Cargo.toml b/spider_chrome/Cargo.toml index 2bd89ef4c..4fc6d9d4a 100644 --- a/spider_chrome/Cargo.toml +++ b/spider_chrome/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_chrome" -version = "2.27.50" +version = "2.27.51" rust-version = "1.70" authors = [ "j-mendez " diff --git a/spider_cli/Cargo.toml b/spider_cli/Cargo.toml index 614a95313..19889eaab 100644 --- a/spider_cli/Cargo.toml +++ b/spider_cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_cli" -version = "2.27.50" +version = "2.27.51" authors = [ "j-mendez " ] diff --git a/spider_transformations/Cargo.toml b/spider_transformations/Cargo.toml index c200d4b07..871874b6e 100644 --- a/spider_transformations/Cargo.toml +++ b/spider_transformations/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_transformations" -version = "2.27.50" +version = "2.27.51" authors = [ "j-mendez " ] diff --git a/spider_utils/Cargo.toml b/spider_utils/Cargo.toml index 283aab917..84b66a180 100644 --- a/spider_utils/Cargo.toml +++ b/spider_utils/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_utils" -version = "2.27.50" +version = "2.27.51" authors = [ "j-mendez " ] diff --git a/spider_worker/Cargo.toml b/spider_worker/Cargo.toml index 73fc64dd0..54183cf2f 100644 --- a/spider_worker/Cargo.toml +++ b/spider_worker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_worker" -version = "2.27.50" +version = "2.27.51" authors = [ "j-mendez " ]