able to extract urls

zick2 · Jun 18, 2020 · 9e2046c · 9e2046c
1 parent 31dee13
commit 9e2046c
Show file tree

Hide file tree

Showing 3 changed files with 27 additions and 9 deletions.
diff --git a/src/main/java/Main.java b/src/main/java/Main.java
@@ -1,20 +1,38 @@
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
+import org.jsoup.nodes.Node;
+import org.jsoup.select.Elements;
+
+import java.io.IOException;
 
 public class Main {
     public  static void main(String[] args){
-        System.out.println("Hello ... ");
+        System.out.println("Crawler ... ");
+
+        /**
+         * Websites for Testing:
+         * 1) http://www.sciencefix.com/
+         * 2) http://kauaimark.blogspot.com/
+         * 3) https://www.miss-thrifty.co.uk
+         * */
+        String start_url = "http://www.sciencefix.com/";
+
+        try{
+            //Fetching and parsing the HTML page ...
+            Document doc = Jsoup.connect(start_url).get();
+            //Extracting all <a> tags that carry an href attribute
+            Elements links = doc.select("a[href]");
+            //For each tag, print the absolute URL resolved from its href attribute
+            for(Element url : links){
+                System.out.println("url : "+url.attr("abs:href"));
+            }
 
-        //Getting some simple HTML ...
-        String html = "<html><head><title>First parse</title></head>"
-                + "<body><p id=\"txt\">Parsed HTML into a doc.</p></body></html>";
-        //Parsing HTML using Jsoup ...
-        Document doc = Jsoup.parse(html);
+        }catch (IOException IO_error){
+            System.out.println("ERROR:");
+            IO_error.printStackTrace();
+        }
 
-        //Extracting an element using Id ...
-         Element elem = doc.getElementById("txt");
-        System.out.print("Extracted element: \n"+ elem);
     }
 
 

diff --git a/target/classes/Main.class b/target/classes/Main.class
diff --git a/target/classes/scrape.class b/target/classes/scrape.class