|
||||||||
| PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||
| SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD | |||||||
java.lang.Object
  org.apache.lenya.search.crawler.IterativeHTMLCrawler
public class IterativeHTMLCrawler
Crawl iteratively
| Constructor Summary | |
|---|---|
IterativeHTMLCrawler(java.io.File config)
Creates a new IterativeHTMLCrawler object. |
|
IterativeHTMLCrawler(java.lang.String url_list_file,
java.lang.String html_dump_directory,
java.lang.String userAgent)
Creates a new IterativeHTMLCrawler object. |
|
| Method Summary | |
|---|---|
java.net.URL |
addURL(java.lang.String urlCandidate,
java.lang.String currentURLPath)
Add URLs to crawl |
java.net.URL |
completeURL(java.net.URL parent,
java.lang.String child)
DOCUMENT ME! |
void |
crawl(java.net.URL start,
java.lang.String scope)
Crawl |
void |
dumpHTDoc(java.net.URL url)
DOCUMENT ME! |
boolean |
filterURL(java.lang.String url,
java.lang.String currentURLPath,
java.util.TreeSet links)
DOCUMENT ME! |
java.lang.String |
getExtension(java.io.File file)
DOCUMENT ME! |
java.lang.String |
getExtension(java.net.URL url)
DOCUMENT ME! |
static java.util.List |
handleHTML(java.net.HttpURLConnection httpCon)
DOCUMENT ME! |
void |
handlePDF(java.net.HttpURLConnection httpCon)
Parse PDF for links |
boolean |
inScope(java.lang.String url)
DOCUMENT ME! |
static void |
main(java.lang.String[] args)
Command line interface |
java.lang.String |
parseHREF(java.lang.String url,
java.lang.String urlLowCase,
java.lang.String currentURLPath)
Parse URL and complete if necessary |
java.util.List |
parsePage(java.lang.String urlString)
DOCUMENT ME! |
| Methods inherited from class java.lang.Object |
|---|
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait |
| Constructor Detail |
|---|
public IterativeHTMLCrawler(java.lang.String url_list_file,
java.lang.String html_dump_directory,
java.lang.String userAgent)
url_list_file - File where all dumped files will be listed
html_dump_directory - Directory where htdocs should be dumped
userAgent - User-agent for robots.txt

public IterativeHTMLCrawler(java.io.File config)
config - Configuration File

| Method Detail |
|---|
public static void main(java.lang.String[] args)
args - Configuration file crawler.xconf
public void crawl(java.net.URL start,
java.lang.String scope)
start - Start crawling at this URL
scope - Limit crawling to this scope
public java.net.URL addURL(java.lang.String urlCandidate,
java.lang.String currentURLPath)
throws java.net.MalformedURLException
urlCandidate - DOCUMENT ME!
currentURLPath - DOCUMENT ME!
java.net.MalformedURLException - DOCUMENT ME!

public java.util.List parsePage(java.lang.String urlString)
urlString - DOCUMENT ME!
public static java.util.List handleHTML(java.net.HttpURLConnection httpCon)
throws java.io.IOException
httpCon - DOCUMENT ME!
java.io.IOException - DOCUMENT ME!

public void handlePDF(java.net.HttpURLConnection httpCon)
httpCon - DOCUMENT ME!
public boolean filterURL(java.lang.String url,
java.lang.String currentURLPath,
java.util.TreeSet links)
url - DOCUMENT ME!
currentURLPath - DOCUMENT ME!
links - DOCUMENT ME!
public java.lang.String parseHREF(java.lang.String url,
java.lang.String urlLowCase,
java.lang.String currentURLPath)
url - URL from href
urlLowCase - url is lower case
currentURLPath - URL of current page
public boolean inScope(java.lang.String url)
url - DOCUMENT ME!
public java.net.URL completeURL(java.net.URL parent,
java.lang.String child)
throws java.net.MalformedURLException
parent - DOCUMENT ME!
child - DOCUMENT ME!
java.net.MalformedURLException - DOCUMENT ME!

public void dumpHTDoc(java.net.URL url)
url - DOCUMENT ME!

public java.lang.String getExtension(java.net.URL url)
url - DOCUMENT ME!
public java.lang.String getExtension(java.io.File file)
file - DOCUMENT ME!
|
||||||||
| PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||
| SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD | |||||||