Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a fetcher that uses a real Chrome browser to download the html #237

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions fetcher/chrome-fetcher/build.gradle.kts
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
// Build script for the chrome-fetcher module: applies the shared buildSrc convention
// plugins and declares the module's dependencies.
plugins {
buildsrc.convention.`kotlin-jvm`
buildsrc.convention.`publish-jvm`
buildsrc.convention.kover
}

dependencies {
api(projects.fetcher.baseFetcher)
// NOTE(review): the only HtmlUnit symbol the fetcher source imports is
// org.htmlunit.org.apache.http.HttpStatus; confirm the full dependency is still needed.
api(Deps.htmlUnit) {
exclude("org.eclipse.jetty.websocket") // avoid android crash; see #93
}
api(Deps.logback)
api(Deps.log4jOverSlf4j)
// Presumably supplies the com.github.kklisura.cdt.* classes imported by ChromeFetcher; verify.
// NOTE(review): coordinates are hard-coded here, unlike the Deps.* entries above.
api("io.fluidsonic.mirror:cdt-java-client:4.0.0-fluidsonic-1")

testImplementation(projects.testUtils)
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
package it.skrape.fetcher

/**
 * Thrown when a fetch through Chrome cannot produce a result.
 *
 * @param msg human-readable description of the failure.
 * @param cause optional underlying failure to keep in the stack trace; defaults to `null`.
 */
public class ChromeException @JvmOverloads constructor(
    msg: String,
    cause: Throwable? = null
) : Exception(msg, cause)
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
package it.skrape.fetcher

import com.github.kklisura.cdt.launch.ChromeArguments
import com.github.kklisura.cdt.launch.ChromeLauncher
import com.github.kklisura.cdt.protocol.events.network.LoadingFinished
import com.github.kklisura.cdt.protocol.support.types.EventHandler
import com.github.kklisura.cdt.services.ChromeService
import org.htmlunit.org.apache.http.HttpStatus

/**
 * [BlockingFetcher] that loads a page in a freshly launched headless Chrome and returns the
 * DOM as it looks once the page's load event has fired.
 *
 * Only [Request.url] is read from the request. The returned [Result] always reports
 * status 200 with an empty status message, an empty content type, no headers and no cookies.
 */
public object ChromeFetcher : BlockingFetcher<Request> {
    override val requestBuilder: Request get() = Request()

    /**
     * Launches Chrome, navigates to [Request.url], waits for the load event and returns the
     * serialized `document.documentElement.outerHTML`.
     *
     * The tab and the Chrome process are released before this function returns, whether it
     * succeeds or fails.
     *
     * @throws ChromeException if the DevTools session closes without a page having been captured.
     */
    override fun fetch(request: Request): Result {
        val chromeArgs = ChromeArguments.defaults(true)
            .additionalArguments("no-sandbox", true)
            .additionalArguments("remote-allow-origins", "*")
            .build()

        val launcher = ChromeLauncher()
        try {
            val chromeService: ChromeService = launcher.launch(chromeArgs)
            val tab = chromeService.createTab()
            try {
                val devToolsService = chromeService.createDevToolsService(tab)
                val page = devToolsService.page
                val runtime = devToolsService.runtime

                var result: Result? = null

                page.onLoadEventFired { _ ->
                    try {
                        val evaluation = runtime.evaluate("document.documentElement.outerHTML")

                        result = Result(
                            responseBody = evaluation.result.value.toString(),
                            responseStatus = Result.Status(HttpStatus.SC_OK, ""),
                            contentType = "",
                            headers = emptyMap(),
                            baseUri = request.url,
                            cookies = emptyList()
                        )
                    } finally {
                        // Always close the session, even when the evaluation fails; otherwise
                        // waitUntilClosed() below would never return.
                        devToolsService.close()
                    }
                }

                page.enable()

                // Navigate to the page in question
                page.navigate(request.url)

                devToolsService.waitUntilClosed()

                return result ?: throw ChromeException("No result found")
            } finally {
                chromeService.closeTab(tab)
            }
        } finally {
            // Terminate the Chrome process started for this fetch so it is not leaked.
            launcher.close()
        }
    }
}
18 changes: 18 additions & 0 deletions fetcher/chrome-fetcher/src/main/resources/logback.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
<!-- Logging setup: a single console appender at INFO, with HtmlUnit and HTTP client noise reduced. -->
<configuration debug="false">

    <appender name="console" class="ch.qos.logback.core.ConsoleAppender">
        <withJansi>false</withJansi>

        <encoder>
            <pattern>%highlight(%.-1level) %date{HH:mm:ss.SSS} [%30.30logger] %msg%n</pattern>
        </encoder>
    </appender>

    <!-- turning down htmlunit logging -->
    <!-- Pre-3.x HtmlUnit package and unshaded Apache HttpClient, kept for compatibility. -->
    <logger name="com.gargoylesoftware.htmlunit" level="OFF"/>
    <logger name="org.apache.http" level="ERROR"/>
    <!-- Package names matching the org.htmlunit.org.apache.http.* classes this module imports. -->
    <logger name="org.htmlunit" level="OFF"/>
    <logger name="org.htmlunit.org.apache.http" level="ERROR"/>

    <root level="INFO">
        <appender-ref ref="console"/>
    </root>
</configuration>
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
package it.skrape.fetcher

import Testcontainer
import com.gargoylesoftware.htmlunit.util.NameValuePair
import org.junit.jupiter.api.Test
import org.junit.jupiter.api.condition.DisabledOnOs
import org.junit.jupiter.api.condition.OS
import org.junit.jupiter.api.parallel.Execution
import org.junit.jupiter.api.parallel.ExecutionMode
import setupCookiesStub
import setupPostStub
import setupRedirect
import setupStub
import strikt.api.expect
import strikt.api.expectThat
import strikt.api.expectThrows
import strikt.assertions.contains
import strikt.assertions.isEqualTo
import java.net.SocketTimeoutException

private val wiremock = Testcontainer.wiremock
private val httpBin = Testcontainer.httpBin

/**
 * Tests for [ChromeFetcher] against pages stubbed on the shared WireMock container.
 *
 * Runs on a single thread and is disabled on Windows.
 */
@Execution(ExecutionMode.SAME_THREAD)
@DisabledOnOs(OS.WINDOWS)
class ChromeFetcherTest {

    // Request pointing at the WireMock root URL; created lazily on first use.
    private val baseRequest by lazy { Request(url = wiremock.httpUrl) }

    @Test
    fun `will fetch localhost 8080 with defaults if no params`() {
        wiremock.setupStub()

        val response = ChromeFetcher.fetch(baseRequest)

        expect {
            that(response.status { code }).isEqualTo(200)
            that(response.responseBody).contains("i'm the title")
        }
    }

    @Test
    fun `can fetch url and use HTTP verb GET by default`() {
        wiremock.setupStub(path = "/example")
        val exampleUrl = "${wiremock.httpUrl}/example"

        val response = ChromeFetcher.fetch(baseRequest.copy(url = exampleUrl, sslRelaxed = true))

        expect {
            that(response.status { code }).isEqualTo(200)
            that(response.responseBody).contains("i'm the title")
        }
    }

    @Test
    fun `can parse js rendered elements`() {
        wiremock.setupStub(fileName = "js.html")

        val renderedHtml = ChromeFetcher.fetch(baseRequest).responseBody

        expectThat(renderedHtml).contains("I have been dynamically added via Javascript")
    }

    // NOTE(review): the name says "https" but the request targets wiremock.httpUrl; confirm intent.
    @Test
    fun `can parse js rendered elements from https page`() {
        wiremock.setupStub(fileName = "js.html")
        val relaxedRequest = baseRequest.copy(url = wiremock.httpUrl, sslRelaxed = true)

        val renderedHtml = ChromeFetcher.fetch(relaxedRequest).responseBody

        expectThat(renderedHtml).contains("I have been dynamically added via Javascript")
    }

    // NOTE(review): the name says "https" but baseRequest is built from wiremock.httpUrl; confirm intent.
    @Test
    fun `can parse es6 rendered elements from https page`() {
        wiremock.setupStub(fileName = "es6.html")

        val renderedHtml = ChromeFetcher.fetch(baseRequest).responseBody

        expectThat(renderedHtml).contains("dynamically added")
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"data":"some value"}
16 changes: 16 additions & 0 deletions fetcher/chrome-fetcher/src/test/resources/__files/es6.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
<!DOCTYPE html>
<!-- Test fixture: a static page whose inline script rewrites every paragraph once it runs. -->
<html lang="en">
<head>
<title>i'm the title</title>
</head>
<body>
i'm the body
<h1>i'm the headline</h1>
<p>i'm a paragraph</p>
<p>i'm a second paragraph</p>
</body>
<script>
// Uses ES6 syntax (const, arrow functions): replaces the content of each <p> with a
// span containing the text "dynamically added".
const getNodesOf = (selector) => document.querySelectorAll(selector);
getNodesOf("p").forEach(p => p.innerHTML = "<span>dynamically added</span>")
</script>
</html>
28 changes: 28 additions & 0 deletions fetcher/chrome-fetcher/src/test/resources/__files/example.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
<!DOCTYPE html>
<!-- Test fixture: a static page with no script. Presumably the default page served by setupStub(); verify in test-utils. -->
<html lang="en">
<head>
<title>i'm the title</title>
</head>
<body>
i'm the body
<h1>i'm the headline</h1>
<header>
<h1>i'm the headers headline</h1>
</header>
<p class="foo bar fizz buzz" data-foo="bar">i'm a paragraph</p>
<p>i'm a second paragraph</p>
<div>
first div
<div>first divs child div</div>
</div>
<div>
second div
<div>second divs child div</div>
</div>
<div class="foo bar fizz buzz">div with class foo</div>
<a-custom-tag>i'm a custom html5 tag</a-custom-tag>
<a href="http://some.url">first link</a>
<a href="http://some-other.url">second link</a>
<a href="/relative-link">relative link</a>
</body>
</html>
19 changes: 19 additions & 0 deletions fetcher/chrome-fetcher/src/test/resources/__files/js.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
<!DOCTYPE html>
<!-- Test fixture: a static page whose inline script appends one extra element to the body. -->
<html lang="en">
<head>
<title>i'm the title</title>
</head>
<body>
i'm the body
<h1>i'm the headline</h1>
<p>i'm a paragraph</p>
<p>i'm a second paragraph</p>
</body>
<script>
// Creates a <div class="dynamic"> holding a text node and appends it to document.body.
var dynamicallyAddedElement = document.createElement("div");
dynamicallyAddedElement.className = "dynamic";
var textNode = document.createTextNode("I have been dynamically added via Javascript");
dynamicallyAddedElement.appendChild(textNode);
document.body.appendChild(dynamicallyAddedElement);
</script>
</html>
2 changes: 2 additions & 0 deletions settings.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ include(
":fetcher:http-fetcher",
":fetcher:async-fetcher",
":fetcher:browser-fetcher",
":fetcher:chrome-fetcher",
":html-parser",
":integrationtests",
":ktor-extension",
Expand All @@ -18,6 +19,7 @@ include(

enableFeaturePreview("TYPESAFE_PROJECT_ACCESSORS")


apply(from = "./buildSrc/repositories.settings.gradle.kts")

@Suppress("UnstableApiUsage") // Central declaration of repositories is an incubating feature
Expand Down
Loading