Skip to content

Commit

Permalink
chore(website): add direct proxy control
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Feb 10, 2025
1 parent 19d030c commit bf72bac
Show file tree
Hide file tree
Showing 10 changed files with 74 additions and 90 deletions.
12 changes: 6 additions & 6 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion spider/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "2.27.50"
version = "2.27.51"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
45 changes: 40 additions & 5 deletions spider/src/configuration.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,29 @@ pub type AllowList = Box<regex::RegexSet>;
#[cfg_attr(not(feature = "regex"), derive(PartialEq, Eq))]
pub struct AllowListSet(pub AllowList);

/// Selects which request type, if any, should skip a configured proxy.
///
/// The value is compared with `==` by the code that wires proxies into the
/// Chrome launcher and the HTTP client builder, so the type is a cheap,
/// copyable, fully comparable flag.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum ProxyIgnore {
    /// Skip this proxy when configuring Chrome.
    Chrome,
    /// Skip this proxy when configuring the HTTP client.
    Http,
    /// Do not ignore the proxy: it is used for every request type.
    #[default]
    No,
}

/// A single networking proxy entry: the proxy address plus a flag telling
/// which request type, if any, should skip it.
///
/// The derived `Default` yields an empty address and `ProxyIgnore::No`.
#[derive(Debug, Default, Clone, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct RequestProxy {
/// The proxy address. Consumers inspect its scheme prefix (for example
/// `socks://`) when deciding how to apply it.
pub addr: String,
/// Which request type skips this proxy: `ProxyIgnore::Chrome` entries are
/// skipped when configuring Chrome, `ProxyIgnore::Http` entries are skipped
/// when configuring the HTTP client, and `ProxyIgnore::No` is never skipped.
pub ignore: ProxyIgnore,
}

/// Structure to configure `Website` crawler
/// ```rust
/// use spider::website::Website;
Expand Down Expand Up @@ -85,7 +108,7 @@ pub struct Configuration {
/// Use HTTP2 for connection. Enable if you know the website has http2 support.
pub http2_prior_knowledge: bool,
/// Use proxy list for performing network request.
pub proxies: Option<Box<Vec<String>>>,
pub proxies: Option<Box<Vec<RequestProxy>>>,
/// Headers to include with request.
pub headers: Option<Box<SerializableHeaderMap>>,
#[cfg(feature = "sitemap")]
Expand Down Expand Up @@ -564,10 +587,22 @@ impl Configuration {

/// Use proxies for request.
///
/// Every address is wrapped in a [`RequestProxy`] whose `ignore` field keeps
/// its default (`ProxyIgnore::No`), so the proxy applies to every request
/// type. Passing `None` clears any previously configured proxies.
///
/// Returns `&mut Self` so calls can be chained.
pub fn with_proxies(&mut self, proxies: Option<Vec<String>>) -> &mut Self {
    self.proxies = proxies.map(|addrs| {
        Box::new(
            addrs
                // The vector is owned here, so move each address instead of cloning it.
                .into_iter()
                .map(|addr| RequestProxy {
                    addr,
                    ..Default::default()
                })
                .collect::<Vec<RequestProxy>>(),
        )
    });
    self
}

/// Use proxies for request with control between chrome and http.
///
/// The supplied list replaces the current proxy configuration as-is, so each
/// entry keeps its own `ignore` setting. Passing `None` clears the proxies.
/// Returns `&mut Self` so calls can be chained.
pub fn with_proxies_direct(&mut self, proxies: Option<Box<Vec<RequestProxy>>>) -> &mut Self {
    let config = self;
    config.proxies = proxies;
    config
}

Expand Down
25 changes: 7 additions & 18 deletions spider/src/features/chrome.rs
Original file line number Diff line number Diff line change
Expand Up @@ -351,7 +351,13 @@ pub async fn launch_browser(
if let Some(ref proxies) = config.proxies {
let use_plain_http = proxies.len() >= 2;

for p in proxies.iter() {
for proxie in proxies.iter() {
if proxie.ignore == ProxyIgnore::Chrome {
continue;
}

let p = proxie.addr;

if !p.is_empty() {
// pick the socks:// proxy over http if found.
if p.starts_with("socks://") {
Expand All @@ -367,23 +373,6 @@ pub async fn launch_browser(
create_content.proxy_bypass_list = Some("<-loopback>".into());
}

if p.starts_with("force_chrome_http://") {
create_content.proxy_server =
Some(p.replacen("force_chrome_http://", "http://", 1).into());
break;
}
if p.starts_with("force_chrome_https://") {
create_content.proxy_server =
Some(p.replacen("force_chrome_https://", "https://", 1).into());
break;
}
if p.starts_with("force_chrome_socks5://") {
create_content.proxy_server = Some(
p.replacen("force_chrome_socks5://", "socks5://", 1).into(),
);
break;
}

create_content.proxy_server = Some(p.into());
}
}
Expand Down
70 changes: 15 additions & 55 deletions spider/src/website.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1146,6 +1146,8 @@ impl Website {
fn configure_http_client_builder(&mut self) -> crate::ClientBuilder {
use reqwest::header::HeaderMap;

use crate::configuration::ProxyIgnore;

let policy = self.setup_redirect_policy();
let mut headers: HeaderMap = HeaderMap::new();

Expand Down Expand Up @@ -1189,35 +1191,12 @@ impl Website {
let replace_plain_socks = proxies.len() == 1 && linux;

for proxie in proxies.iter() {
if proxie.starts_with("force_req_http://") {
if let Ok(proxy) =
reqwest::Proxy::all(&proxie.replacen("force_req_http://", "http://", 1))
{
client = client.proxy(proxy);
}
break;
}
if proxie.starts_with("force_req_https://") {
if let Ok(proxy) = reqwest::Proxy::all(&proxie.replacen(
"force_req_https://",
"https://",
1,
)) {
client = client.proxy(proxy);
}
break;
}
if proxie.starts_with("force_req_socks5://") {
if let Ok(proxy) = reqwest::Proxy::all(&proxie.replacen(
"force_req_socks5://",
"socks5://",
1,
)) {
client = client.proxy(proxy);
}
break;
if proxie.ignore == ProxyIgnore::Http {
continue;
}

let proxie = &proxie.addr;

let socks = proxie.starts_with("socks://");

// we can skip it and use another proxy from the list.
Expand Down Expand Up @@ -1309,35 +1288,10 @@ impl Website {
let replace_plain_socks = proxies.len() == 1 && linux;

for proxie in proxies.iter() {
// special force proxy conditions. We should map the configs instead later.
if proxie.starts_with("force_req_http://") {
if let Ok(proxy) =
reqwest::Proxy::all(&proxie.replacen("force_req_http://", "http://", 1))
{
client = client.proxy(proxy);
}
break;
}
if proxie.starts_with("force_req_https://") {
if let Ok(proxy) = reqwest::Proxy::all(&proxie.replacen(
"force_req_https://",
"https://",
1,
)) {
client = client.proxy(proxy);
}
break;
}
if proxie.starts_with("force_req_socks5://") {
if let Ok(proxy) = reqwest::Proxy::all(&proxie.replacen(
"force_req_socks5://",
"socks5://",
1,
)) {
client = client.proxy(proxy);
}
break;
if proxie.ignore == ProxyIgnore::Http {
continue;
}
let proxie = &proxie.addr;

let socks = proxie.starts_with("socks://");

Expand Down Expand Up @@ -4613,6 +4567,12 @@ impl Website {
self
}

/// Use proxies for request with control between chrome and http.
///
/// Forwards the list unchanged to the crawler configuration's
/// `with_proxies_direct` and returns `&mut Self` so calls can be chained.
pub fn with_proxies_direct(
    &mut self,
    proxies: Option<Box<Vec<crate::configuration::RequestProxy>>>,
) -> &mut Self {
    let website = self;
    website.configuration.with_proxies_direct(proxies);
    website
}

/// Set the concurrency limits. If you set the value to None, the default limits are used, based on the system CPU cores * n.
pub fn with_concurrency_limit(&mut self, limit: Option<usize>) -> &mut Self {
self.configuration.with_concurrency_limit(limit);
Expand Down
2 changes: 1 addition & 1 deletion spider_chrome/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_chrome"
version = "2.27.50"
version = "2.27.51"
rust-version = "1.70"
authors = [
"j-mendez <[email protected]>"
Expand Down
2 changes: 1 addition & 1 deletion spider_cli/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_cli"
version = "2.27.50"
version = "2.27.51"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_transformations/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_transformations"
version = "2.27.50"
version = "2.27.51"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_utils/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_utils"
version = "2.27.50"
version = "2.27.51"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_worker/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_worker"
version = "2.27.50"
version = "2.27.51"
authors = [
"j-mendez <[email protected]>"
]
Expand Down

0 comments on commit bf72bac

Please sign in to comment.