From 46ef0f5ebb968911248ea8bf8513a014e9bc4ee2 Mon Sep 17 00:00:00 2001 From: Gustavo Pinto Date: Thu, 23 May 2013 17:50:32 -0300 Subject: [PATCH] adding tests and improving code quality. rel #33 --- .../main/br/ufpe/cin/groundhog/Project.java | 21 +++----- .../cin/groundhog/crawler/CrawlGitHub.java | 9 ++-- .../groundhog/crawler/CrawlGoogleCode.java | 27 ++-------- .../groundhog/crawler/CrawlSourceForge.java | 38 ++------------ .../cin/groundhog/crawler/ForgeCrawler.java | 14 ++--- .../br/ufpe/cin/groundhog/main/CmdMain.java | 3 +- .../br/ufpe/cin/groundhog/main/TestMain.java | 3 -- .../groundhog/search/SearchGoogleCode.java | 48 +++++++---------- .../groundhog/search/SearchSourceForge.java | 20 ++++---- .../groundhog/crawler/CrawGoogleCodeTest.java | 51 +++++++++++++++++++ .../groundhog/crawler/CrawlGitHubTest.java | 1 - .../crawler/CrawlSourceForgeTest.java | 51 +++++++++++++++++++ .../search/SearchGoogleCodeTest.java | 1 + 13 files changed, 156 insertions(+), 131 deletions(-) create mode 100644 src/java/test/br/ufpe/cin/groundhog/crawler/CrawGoogleCodeTest.java create mode 100644 src/java/test/br/ufpe/cin/groundhog/crawler/CrawlSourceForgeTest.java diff --git a/src/java/main/br/ufpe/cin/groundhog/Project.java b/src/java/main/br/ufpe/cin/groundhog/Project.java index 25bcc7e..bf1c74a 100644 --- a/src/java/main/br/ufpe/cin/groundhog/Project.java +++ b/src/java/main/br/ufpe/cin/groundhog/Project.java @@ -1,9 +1,9 @@ package br.ufpe.cin.groundhog; -import java.text.ParseException; -import java.text.SimpleDateFormat; import java.util.Date; +import br.ufpe.cin.groundhog.util.Dates; + /** * Represents a software project in Groundhog * @author fjsj, gustavopinto, rodrigoalvesvieira @@ -330,14 +330,10 @@ public void setCreatedAt(Date createdAt) { } /** - * * @param createdAtParam the String correspondent to the creation date of the project in question. e.g: 2012-04-28T15:40:35Z - * @throws java.text.ParseException */ - public void setCreatedAt(String createdAtParam) throws java.text.ParseException { - SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - Date createAtDate = format.parse(createdAtParam.replace('T', ' ').replace("Z", "")); - + public void setCreatedAt(String createdAtParam) { + Date createAtDate = new Dates("yyyy-MM-dd HH:mm:ss").format(createdAtParam); this.createdAt = createAtDate; } @@ -359,16 +355,11 @@ public void setLastPushedAt(Date lastPushedAtParam) { } /** - * * @param lastPushedAtParam the String correspondent to the date of the last push to the project * in question. e.g: 2012-04-28T15:40:35Z - * @throws ParseException - * @throws java.text.ParseException */ - public void setLastPushedAt(String lastPushedAtParam) throws ParseException, java.text.ParseException { - SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - Date lastPushDate = format.parse(lastPushedAtParam.replace('T', ' ').replace("Z", "")); - + public void setLastPushedAt(String lastPushedAtParam){ + Date lastPushDate = new Dates("yyyy-MM-dd HH:mm:ss").format(lastPushedAtParam); this.lastPushedAt = lastPushDate; } diff --git a/src/java/main/br/ufpe/cin/groundhog/crawler/CrawlGitHub.java b/src/java/main/br/ufpe/cin/groundhog/crawler/CrawlGitHub.java index 8411389..7dbecd0 100644 --- a/src/java/main/br/ufpe/cin/groundhog/crawler/CrawlGitHub.java +++ b/src/java/main/br/ufpe/cin/groundhog/crawler/CrawlGitHub.java @@ -21,15 +21,16 @@ */ public class CrawlGitHub extends ForgeCrawler { - private final static Logger logger = LoggerFactory - .getLogger(CrawlGitHub.class); + private final static Logger logger = LoggerFactory.getLogger(CrawlGitHub.class); private final GitClient gitClient; + private final File destinationFolder; @Inject public CrawlGitHub(GitClient gitClient, File destinationFolder) { - super(destinationFolder); + super(); this.gitClient = gitClient; + this.destinationFolder = destinationFolder; } @Override @@ -41,7 +42,7 @@ protected File downloadProject(Project project) logger.info(String.format("Downloading %s project..", project.getName())); - gitClient.clone(cloneUrl, projectFolder); + this.gitClient.clone(cloneUrl, projectFolder); return projectFolder; } } \ No newline at end of file diff --git a/src/java/main/br/ufpe/cin/groundhog/crawler/CrawlGoogleCode.java b/src/java/main/br/ufpe/cin/groundhog/crawler/CrawlGoogleCode.java index 6091a70..2067ce1 100644 --- a/src/java/main/br/ufpe/cin/groundhog/crawler/CrawlGoogleCode.java +++ b/src/java/main/br/ufpe/cin/groundhog/crawler/CrawlGoogleCode.java @@ -2,9 +2,6 @@ import java.io.File; import java.io.IOException; -import java.util.Arrays; -import java.util.List; -import java.util.concurrent.Future; import org.eclipse.jgit.api.errors.GitAPIException; import org.eclipse.jgit.api.errors.InvalidRemoteException; @@ -16,10 +13,6 @@ import br.ufpe.cin.groundhog.Project; import br.ufpe.cin.groundhog.SCM; import br.ufpe.cin.groundhog.scmclient.GitClient; -import br.ufpe.cin.groundhog.scmclient.ScmModule; - -import com.google.inject.Guice; -import com.google.inject.Injector; /** * A concrete class to crawl GitHub. @@ -30,10 +23,12 @@ public class CrawlGoogleCode extends ForgeCrawler { private static Logger logger = LoggerFactory.getLogger(CrawlGoogleCode.class); private final GitClient gitClient; + private final File destinationFolder; public CrawlGoogleCode(GitClient gitClient, File destinationFolder) { - super(destinationFolder); + super(); this.gitClient = gitClient; + this.destinationFolder = destinationFolder; } @Override @@ -71,20 +66,4 @@ protected File downloadProject(Project project) } return projectFolder; } - - public static void main(String[] args) throws Exception { - long time = System.nanoTime(); - List projects = Arrays.asList( - new Project("epubcheck", "")); - File dest = new File("C:\\Users\\fjsj\\Downloads\\EponaProjects\\"); - - Injector injector = Guice.createInjector(new ScmModule()); - GitClient gitClient = injector.getInstance(GitClient.class); - - CrawlGoogleCode crawl = new CrawlGoogleCode(gitClient, dest); - List> fs = crawl.downloadProjects(projects); - crawl.shutdown(); - for (Future f : fs) f.get(); - System.out.printf("Elapsed: %.2f", (System.nanoTime() - time) / 1000000000.0); - } } \ No newline at end of file diff --git a/src/java/main/br/ufpe/cin/groundhog/crawler/CrawlSourceForge.java b/src/java/main/br/ufpe/cin/groundhog/crawler/CrawlSourceForge.java index 43b5cc6..c103616 100644 --- a/src/java/main/br/ufpe/cin/groundhog/crawler/CrawlSourceForge.java +++ b/src/java/main/br/ufpe/cin/groundhog/crawler/CrawlSourceForge.java @@ -9,7 +9,6 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Date; -import java.util.Enumeration; import java.util.List; import java.util.Stack; import java.util.Vector; @@ -24,12 +23,9 @@ import br.ufpe.cin.groundhog.Project; import br.ufpe.cin.groundhog.extractor.Formats; -import br.ufpe.cin.groundhog.http.HttpModule; import br.ufpe.cin.groundhog.http.Requests; -import com.google.inject.Guice; import com.google.inject.Inject; -import com.google.inject.Injector; import com.ning.http.client.AsyncCompletionHandler; import com.ning.http.client.ListenableFuture; import com.ning.http.client.Response; @@ -40,13 +36,15 @@ public class CrawlSourceForge extends ForgeCrawler { private ConcurrentHashMap mapModifiedDate; private SimpleDateFormat dateFormat; private Requests requests; + private File destinationFolder; @Inject public CrawlSourceForge(Requests requests, File destinationFolder) { - super(destinationFolder); + super(); this.mapModifiedDate = new ConcurrentHashMap(); this.dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss z"); this.requests = requests; + this.destinationFolder = destinationFolder; } private void parseURLsFromPage(String project, String html, @@ -190,34 +188,4 @@ protected File downloadProject(Project project) throws IOException, } return new File(destinationFolder, projectName); } - - private void setProjectsDirectoriesDates() { - Enumeration e = mapModifiedDate.keys(); - for (String relativePath = e.nextElement(); e.hasMoreElements(); relativePath = e - .nextElement()) { - Date modifiedDate = mapModifiedDate.get(relativePath); - File f = new File(destinationFolder, relativePath); - if (f.exists()) { - f.setLastModified(modifiedDate.getTime()); - } - } - } - - public static void main(String[] args) throws Exception { - long time = System.nanoTime(); - List projects = Arrays.asList(new Project("geom-java", ""), - new Project("im4java", "")); - File dest = new File("C:\\Users\\fjsj\\Downloads\\EponaProjects\\"); - Injector injector = Guice.createInjector(new HttpModule()); - Requests requests = injector.getInstance(Requests.class); - - CrawlSourceForge crawl = new CrawlSourceForge(requests, dest); - List> fs = crawl.downloadProjects(projects); - crawl.shutdown(); - for (Future f : fs) - f.get(); - crawl.setProjectsDirectoriesDates(); - System.out.printf("Elapsed: %.2f", - (System.nanoTime() - time) / 1000000000.0); - } } \ No newline at end of file diff --git a/src/java/main/br/ufpe/cin/groundhog/crawler/ForgeCrawler.java b/src/java/main/br/ufpe/cin/groundhog/crawler/ForgeCrawler.java index dd4c006..2b64e3b 100644 --- a/src/java/main/br/ufpe/cin/groundhog/crawler/ForgeCrawler.java +++ b/src/java/main/br/ufpe/cin/groundhog/crawler/ForgeCrawler.java @@ -19,17 +19,14 @@ * */ public abstract class ForgeCrawler { + private ExecutorService ex; - protected File destinationFolder; /** * Constructs a new ForgeCrawler with a given destinationFolder. - * - * @param destinationFolder folder into which projects will be downloaded */ - protected ForgeCrawler(File destinationFolder) { - ex = Executors.newFixedThreadPool(JsonInput.getMaxThreads()); - this.destinationFolder = destinationFolder; + protected ForgeCrawler() { + this.ex = Executors.newFixedThreadPool(JsonInput.getMaxThreads()); } /** @@ -63,6 +60,9 @@ public File call() throws Exception { }); fs.add(f); } + + shutdown(); + return fs; } @@ -70,7 +70,7 @@ public File call() throws Exception { * Guarantees downloads to be executed, but no new downloads will be accepted. * Should be called after downloadProjects. */ - public void shutdown() { + private void shutdown() { ex.shutdownNow(); } diff --git a/src/java/main/br/ufpe/cin/groundhog/main/CmdMain.java b/src/java/main/br/ufpe/cin/groundhog/main/CmdMain.java index f00381f..79225f3 100644 --- a/src/java/main/br/ufpe/cin/groundhog/main/CmdMain.java +++ b/src/java/main/br/ufpe/cin/groundhog/main/CmdMain.java @@ -287,9 +287,8 @@ public void run() { } })); } - crawler.shutdown(); + ex.shutdown(); - for (int i = 0; i < analysisFutures.size(); i++) { try { analysisFutures.get(i).get(); diff --git a/src/java/main/br/ufpe/cin/groundhog/main/TestMain.java b/src/java/main/br/ufpe/cin/groundhog/main/TestMain.java index f8c4497..3876555 100644 --- a/src/java/main/br/ufpe/cin/groundhog/main/TestMain.java +++ b/src/java/main/br/ufpe/cin/groundhog/main/TestMain.java @@ -62,7 +62,6 @@ public static void gitHubExample(String term) throws Exception { logger.info("2 - Download 1st result..."); ForgeCrawler crawler = new CrawlGitHub(injector.getInstance(GitClient.class), downloadFolder); List> futures = crawler.downloadProjects(projects); - crawler.shutdown(); File repositoryFolder = null; for (Future f : futures) { // wait for download repositoryFolder = f.get(); @@ -111,7 +110,6 @@ public static void sourceForgeExample() throws Exception { ForgeCrawler crawler = new CrawlSourceForge(requests, downloadFolder); List> futures = crawler.downloadProjects(projects); - crawler.shutdown(); File repositoryFolder = null; for (Future f : futures) { // wait for download repositoryFolder = f.get(); @@ -153,7 +151,6 @@ public static void googleCodeExample(String term) throws Exception { logger.info("2 - Download 1st result..."); ForgeCrawler crawler = new CrawlGoogleCode(injector.getInstance(GitClient.class), downloadFolder); List> futures = crawler.downloadProjects(projects); - crawler.shutdown(); File repositoryFolder = null; for (Future f : futures) { // wait for download repositoryFolder = f.get(); diff --git a/src/java/main/br/ufpe/cin/groundhog/search/SearchGoogleCode.java b/src/java/main/br/ufpe/cin/groundhog/search/SearchGoogleCode.java index 9d98be0..a5b4b18 100644 --- a/src/java/main/br/ufpe/cin/groundhog/search/SearchGoogleCode.java +++ b/src/java/main/br/ufpe/cin/groundhog/search/SearchGoogleCode.java @@ -1,6 +1,5 @@ package br.ufpe.cin.groundhog.search; -import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.concurrent.Future; @@ -26,6 +25,7 @@ * */ public class SearchGoogleCode implements ForgeSearch { + private static String root = "http://code.google.com"; private final Requests requests; @@ -38,17 +38,12 @@ public SearchGoogleCode(Requests requests) { * Fetches and returns the checkout command String for the project * @param html the HTML content of the page to be parsed * @return the checkout command within the given HTML page - * @throws IOException */ - private String parseCheckoutCommand(String html) throws IOException { + private String parseCheckoutCommand(String html) { Document doc = Jsoup.parse(html); Elements es = doc.select("#checkoutcmd"); - if (es.isEmpty()) { - return ""; - } else { - return es.first().text(); - } + return es.isEmpty() ? "" : es.first().text(); } /** @@ -57,18 +52,15 @@ private String parseCheckoutCommand(String html) throws IOException { * @param project the project to which the checkout must be applied */ private void setCheckoutCommandToProject(String command, Project project) { + String url = command.split(" ")[2]; + project.setScmURL(url); + if (command.startsWith("svn")) { - String url = command.split(" ")[2]; project.setSCM(SCM.SVN); - project.setScmURL(url); } else if (command.startsWith("git")) { - String url = command.split(" ")[2]; project.setSCM(SCM.GIT); - project.setScmURL(url); } else if (command.startsWith("hg")) { - String url = command.split(" ")[2]; project.setSCM(SCM.HG); - project.setScmURL(url); } else if (command.equals("")) { project.setSCM(SCM.NONE); } else { @@ -79,13 +71,12 @@ private void setCheckoutCommandToProject(String command, Project project) { public List getProjects(String term, int page) throws SearchException { try { List projects = new ArrayList(); - String paramsStr = - new ParamBuilder(). - add("q", term + " label:Java"). - add("start", String.valueOf((page - 1) * 10)). - build(); + String params = new ParamBuilder() + .add("q", term + " label:Java") + .add("start", String.valueOf((page - 1) * 10)) + .build(); - Document doc = Jsoup.parse(requests.get(root + "/hosting/search?" + paramsStr)); + Document doc = Jsoup.parse(requests.get(root + "/hosting/search?" + params)); for (Element tr : doc.select("#serp table tbody tr")) { Element el = tr.child(0).child(0); @@ -93,22 +84,19 @@ public List getProjects(String term, int page) throws SearchException { // of people watching the project on Google Code Element span = tr.child(1).child(2).child(0); - String projectName, description, imgSrc, iconURL, sourceCodeUrl; - int stars; - - projectName = el.attr("href").split("/")[2]; - description = tr.child(1).ownText(); - imgSrc = el.child(0).attr("src"); - iconURL = imgSrc; - stars = Integer.parseInt(span.text()); + String projectName = el.attr("href").split("/")[2]; + String description = tr.child(1).ownText(); + String iconURL = el.child(0).attr("src"); - if (imgSrc.startsWith("/")) { + if (iconURL.startsWith("/")) { iconURL = root + iconURL; } - sourceCodeUrl = "https://code.google.com/p/" + projectName + "/source/browse/"; + String sourceCodeUrl = "https://code.google.com/p/" + projectName + "/source/browse/"; Project forgeProject = new Project(projectName, description, iconURL, sourceCodeUrl); + + int stars = Integer.parseInt(span.text()); forgeProject.setWatchersCount(stars); forgeProject.setFollowersCount(stars); projects.add(forgeProject); diff --git a/src/java/main/br/ufpe/cin/groundhog/search/SearchSourceForge.java b/src/java/main/br/ufpe/cin/groundhog/search/SearchSourceForge.java index de1b696..38a91a9 100644 --- a/src/java/main/br/ufpe/cin/groundhog/search/SearchSourceForge.java +++ b/src/java/main/br/ufpe/cin/groundhog/search/SearchSourceForge.java @@ -21,6 +21,7 @@ * */ public class SearchSourceForge implements ForgeSearch { + private final Requests requests; @Inject @@ -30,16 +31,15 @@ public SearchSourceForge(Requests requests) { public List getProjects(String term, int page) throws SearchException { try { - List projects = new ArrayList(); - String paramsStr = - new ParamBuilder(). - add("q", term). - add("sort", "popular"). - add("page", String.valueOf(page)). - build(); + String paramsStr = new ParamBuilder() + .add("q", term) + .add("sort", "popular") + .add("page", String.valueOf(page)) + .build(); Document doc = Jsoup.parse(requests.get("http://sourceforge.net/directory/language:java/?" + paramsStr)); + List projects = new ArrayList(); for (Element li: doc.select(".projects > li")) { Element a = li.select("[itemprop=url]").first(); @@ -48,20 +48,20 @@ public List getProjects(String term, int page) throws SearchException { projectName = a.attr("href").split("/")[2]; description = li.select("[itemprop=description]").first().text(); - iconURL = li.select("[itemprop=image]").first().attr("src"); + iconURL = li.select("[itemprop=image]").first().attr("src"); if (iconURL.startsWith("//")) { iconURL = "http:" + iconURL; } projectURL = String.format("http://sourceforge.net/projects/%s/files/", projectName); - Project forgeProject = new Project(projectName, description, iconURL, SCM.SOURCE_FORGE, projectURL); - projects.add(forgeProject); + projects.add(new Project(projectName, description, iconURL, SCM.SOURCE_FORGE, projectURL)); } } return projects; } catch (IOException e) { + e.printStackTrace(); throw new SearchException(e); } } diff --git a/src/java/test/br/ufpe/cin/groundhog/crawler/CrawGoogleCodeTest.java b/src/java/test/br/ufpe/cin/groundhog/crawler/CrawGoogleCodeTest.java new file mode 100644 index 0000000..7e6c350 --- /dev/null +++ b/src/java/test/br/ufpe/cin/groundhog/crawler/CrawGoogleCodeTest.java @@ -0,0 +1,51 @@ +package br.ufpe.cin.groundhog.crawler; + +import java.io.File; +import java.util.Arrays; +import java.util.List; +import java.util.concurrent.Future; + +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import br.ufpe.cin.groundhog.Project; +import br.ufpe.cin.groundhog.scmclient.GitClient; +import br.ufpe.cin.groundhog.scmclient.ScmModule; +import br.ufpe.cin.groundhog.search.SearchGoogleCode; +import br.ufpe.cin.groundhog.search.SearchModule; + +import com.google.common.io.Files; +import com.google.inject.Guice; +import com.google.inject.Injector; + +public class CrawGoogleCodeTest { + + private SearchGoogleCode searchGoogleCode; + private GitClient gitClient; + + @Before + public void setup() { + Injector injector = Guice.createInjector(new SearchModule(), new ScmModule()); + searchGoogleCode = injector.getInstance(SearchGoogleCode.class); + gitClient = injector.getInstance(GitClient.class); + } + + @Test + public void testCrawlGithub() { + try { + Project project = searchGoogleCode.getProjects("java", 1).get(0); + List projects = Arrays.asList(project); + + CrawlGoogleCode crawl = new CrawlGoogleCode(gitClient, Files.createTempDir()); + List> fs = crawl.downloadProjects(projects); + for (Future f : fs) { + File file = f.get(); + Assert.assertNotNull(file); + } + + } catch (Exception e) { + Assert.fail(); + } + } +} diff --git a/src/java/test/br/ufpe/cin/groundhog/crawler/CrawlGitHubTest.java b/src/java/test/br/ufpe/cin/groundhog/crawler/CrawlGitHubTest.java index c216eae..b1591f8 100644 --- a/src/java/test/br/ufpe/cin/groundhog/crawler/CrawlGitHubTest.java +++ b/src/java/test/br/ufpe/cin/groundhog/crawler/CrawlGitHubTest.java @@ -40,7 +40,6 @@ public void testCrawlGithub() { List projects = Arrays.asList(project); CrawlGitHub crawl = new CrawlGitHub(gitClient, Files.createTempDir()); List> fs = crawl.downloadProjects(projects); - crawl.shutdown(); for (Future f : fs) { File file = f.get(); Assert.assertNotNull(file); diff --git a/src/java/test/br/ufpe/cin/groundhog/crawler/CrawlSourceForgeTest.java b/src/java/test/br/ufpe/cin/groundhog/crawler/CrawlSourceForgeTest.java new file mode 100644 index 0000000..07aa531 --- /dev/null +++ b/src/java/test/br/ufpe/cin/groundhog/crawler/CrawlSourceForgeTest.java @@ -0,0 +1,51 @@ +package br.ufpe.cin.groundhog.crawler; + +import java.io.File; +import java.util.Arrays; +import java.util.List; +import java.util.concurrent.Future; + +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import br.ufpe.cin.groundhog.Project; +import br.ufpe.cin.groundhog.http.HttpModule; +import br.ufpe.cin.groundhog.http.Requests; +import br.ufpe.cin.groundhog.search.SearchModule; +import br.ufpe.cin.groundhog.search.SearchSourceForge; + +import com.google.common.io.Files; +import com.google.inject.Guice; +import com.google.inject.Injector; + +public class CrawlSourceForgeTest { + + private SearchSourceForge searchSourceForge; + private Requests requests; + + @Before + public void setup() { + Injector injector = Guice.createInjector(new SearchModule(), new HttpModule()); + searchSourceForge = injector.getInstance(SearchSourceForge.class); + requests = injector.getInstance(Requests.class); + } + + @Test + public void testCrawlGithub() { + try { + Project project = searchSourceForge.getProjects("geom-java", 1).get(0); + List projects = Arrays.asList(project); + + CrawlSourceForge crawl = new CrawlSourceForge(requests, Files.createTempDir()); + List> fs = crawl.downloadProjects(projects); + for (Future f : fs) { + File file = f.get(); + Assert.assertNotNull(file); + } + + } catch (Exception e) { + Assert.fail(); + } + } +} \ No newline at end of file diff --git a/src/java/test/br/ufpe/cin/groundhog/search/SearchGoogleCodeTest.java b/src/java/test/br/ufpe/cin/groundhog/search/SearchGoogleCodeTest.java index 617ccc8..b3e9b0b 100644 --- a/src/java/test/br/ufpe/cin/groundhog/search/SearchGoogleCodeTest.java +++ b/src/java/test/br/ufpe/cin/groundhog/search/SearchGoogleCodeTest.java @@ -24,6 +24,7 @@ public void setup() { @Test public void testSimpleSearch() { List projects = searchGoogleCode.getProjects("java", 1); + System.out.println(projects); Assert.assertNotNull(projects); } }