For crawling in Java, you'll need the Jsoup library. It provides a very convenient API for extracting and manipulating data, using the best of DOM, CSS, and jQuery-like methods.
A simple example of Jsoup:
Add it to your project through the Maven pom.xml (the <dependency> snippet appears further below):
Refer to the URLs listed at the end of this post for more background. A simple example of Jsoup:
// Fetch the page at the given URL and parse the response into a jsoup Document.
// get() performs the network request and can throw java.io.IOException.
Document doc = Jsoup.connect("http://en.wikipedia.org/").get();
// CSS selector: every <a> nested in a <b> inside the element whose id is "mp-itn".
Elements newsHeadlines = doc.select("#mp-itn b a");
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.2</version>
</dependency>
public getmoreURL {
Document doc = null;
String currentURL;
for (HashMap tmpMap: urlInfos) {
currentURL = tmpMap.get("url");
doc = Jsoup.connect(currentURL).get();
Elements articleLead = doc.select("article.lead-story");
if (currentURL.contains("xyz")) {
for (Element elem: articleLead) {
int size = elem.select("a").size();
Element anchor = elem.select("a").get(1);
String title = anchor.text();
String URL = anchor.attr("href");
if (title.length() == 0) title = elem.select("a").get(2).text();
if (!dbUtils.find("url", URL)) urlQueue.add(URL);
if (!alreadyInList(URL)) {
HashMap info = new HashMap();
info.put("url", URL);
urlInfos.add(info);
retrieveRelatedStory(URL);
} else System.out.println("Title - " + title + " already in the list.");
}
Elements articleElement = doc.select("article.teaser");
for (Element elem: articleElement) {
int size = elem.select("a").size();
Element anchor = elem.select("a").get(1);
String title = anchor.text();
String URL = anchor.attr("href");
if (title.length() == 0) title = elem.select("a").get(2).text();
if (!dbUtils.find("url", URL)) {
urlQueue.add(URL);
crawURLsFromHomePage(title, URL);
retrieveRelatedStory(URL);
} else System.out.println("Title - " + title + " already in the list.");
}
} else {
if (!alreadyInList(currentURL)) {
HashMap tempRelated = new HashMap();
tempRelated.put("url", currentURL);
urlInfos.add(tempRelated);
retrieveRelatedStory(currentURL);
}
retrieveRelatedStory(currentURL);
}
}
}
https://ksah.in/introduction-to-web-scraping-with-java/
http://jaunt-api.com/
No comments:
Post a Comment