Skip to content

Commit

Permalink
able to extract urls
Browse files Browse the repository at this point in the history
  • Loading branch information
zick2 committed Jun 18, 2020
1 parent 31dee13 commit 9e2046c
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 9 deletions.
36 changes: 27 additions & 9 deletions src/main/java/Main.java
Original file line number Diff line number Diff line change
@@ -1,20 +1,38 @@
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.Elements;

import java.io.IOException;

public class Main {
/**
 * Entry point of the crawler demo.
 *
 * <p>Fetches the start page, prints the absolute URL of every {@code <a href>}
 * link found on it, then parses a small in-memory HTML sample and prints the
 * element whose id is {@code txt}.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    System.out.println("Hello ... ");
    System.out.println("Crawler ... ");

    /*
     * Websites for testing:
     * 1) http://www.sciencefix.com/
     * 2) http://kauaimark.blogspot.com/
     * 3) https://www.miss-thrifty.co.uk
     */
    String startUrl = "http://www.sciencefix.com/";

    try {
        // Fetch and parse the HTML page at the start URL.
        Document page = Jsoup.connect(startUrl).get();
        // Select every <a> tag that carries an href attribute.
        Elements links = page.select("a[href]");
        // The "abs:" prefix asks jsoup for the absolute form of the link.
        for (Element link : links) {
            System.out.println("url : " + link.attr("abs:href"));
        }
    } catch (IOException ioError) {
        // A failed fetch is reported but does not stop the sample parse below.
        System.out.println("ERROR:");
        ioError.printStackTrace();
    }

    // Sample HTML kept in memory; parsing it needs no network access, so it
    // lives outside the try block and uses its own variable name.
    String html = "<html><head><title>First parse</title></head>"
            + "<body><p id=\"txt\">Parsed HTML into a doc.</p></body></html>";
    Document sampleDoc = Jsoup.parse(html);

    // Look up a single element by its id attribute.
    Element elem = sampleDoc.getElementById("txt");
    System.out.print("Extracted element: \n" + elem);
}


Expand Down
Binary file modified target/classes/Main.class
Binary file not shown.
Binary file added target/classes/scrape.class
Binary file not shown.

0 comments on commit 9e2046c

Please sign in to comment.