|
||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |
java.lang.Object
org.apache.lenya.search.crawler.IterativeHTMLCrawler
public class IterativeHTMLCrawler
Crawl iteratively
Constructor Summary | |
---|---|
IterativeHTMLCrawler(java.io.File config)
Creates a new IterativeHTMLCrawler object. |
|
IterativeHTMLCrawler(java.lang.String url_list_file,
java.lang.String html_dump_directory,
java.lang.String userAgent)
Creates a new IterativeHTMLCrawler object. |
Method Summary | |
---|---|
java.net.URL |
addURL(java.lang.String urlCandidate,
java.lang.String currentURLPath)
Add URLs to crawl |
java.net.URL |
completeURL(java.net.URL parent,
java.lang.String child)
DOCUMENT ME! |
void |
crawl(java.net.URL start,
java.lang.String scope)
Crawl |
void |
dumpHTDoc(java.net.URL url)
DOCUMENT ME! |
boolean |
filterURL(java.lang.String url,
java.lang.String currentURLPath,
java.util.TreeSet links)
DOCUMENT ME! |
java.lang.String |
getExtension(java.io.File file)
DOCUMENT ME! |
java.lang.String |
getExtension(java.net.URL url)
DOCUMENT ME! |
static java.util.List |
handleHTML(java.net.HttpURLConnection httpCon)
DOCUMENT ME! |
void |
handlePDF(java.net.HttpURLConnection httpCon)
Parse PDF for links |
boolean |
inScope(java.lang.String url)
DOCUMENT ME! |
static void |
main(java.lang.String[] args)
Command line interface |
java.lang.String |
parseHREF(java.lang.String url,
java.lang.String urlLowCase,
java.lang.String currentURLPath)
Parse URL and complete if necessary |
java.util.List |
parsePage(java.lang.String urlString)
DOCUMENT ME! |
Methods inherited from class java.lang.Object |
---|
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait |
Constructor Detail |
---|
public IterativeHTMLCrawler(java.lang.String url_list_file, java.lang.String html_dump_directory, java.lang.String userAgent)
url_list_file
- File where all dumped files will be listed
html_dump_directory
- Directory where htdocs should be dumped
userAgent
- User-agent for robots.txt
public IterativeHTMLCrawler(java.io.File config)
config
- Configuration File
Method Detail |
---|
public static void main(java.lang.String[] args)
args
- Configuration file crawler.xconf
public void crawl(java.net.URL start, java.lang.String scope)
start
- Start crawling at this URL
scope
- Limit crawling to this scope
public java.net.URL addURL(java.lang.String urlCandidate, java.lang.String currentURLPath) throws java.net.MalformedURLException
urlCandidate
- DOCUMENT ME!
currentURLPath
- DOCUMENT ME!
java.net.MalformedURLException
- DOCUMENT ME!
public java.util.List parsePage(java.lang.String urlString)
urlString
- DOCUMENT ME!
public static java.util.List handleHTML(java.net.HttpURLConnection httpCon) throws java.io.IOException
httpCon
- DOCUMENT ME!
java.io.IOException
- DOCUMENT ME!
public void handlePDF(java.net.HttpURLConnection httpCon)
httpCon
- DOCUMENT ME!
public boolean filterURL(java.lang.String url, java.lang.String currentURLPath, java.util.TreeSet links)
url
- DOCUMENT ME!
currentURLPath
- DOCUMENT ME!
links
- DOCUMENT ME!
public java.lang.String parseHREF(java.lang.String url, java.lang.String urlLowCase, java.lang.String currentURLPath)
url
- URL from href
urlLowCase
- url is lower case
currentURLPath
- URL of current page
public boolean inScope(java.lang.String url)
url
- DOCUMENT ME!
public java.net.URL completeURL(java.net.URL parent, java.lang.String child) throws java.net.MalformedURLException
parent
- DOCUMENT ME!
child
- DOCUMENT ME!
java.net.MalformedURLException
- DOCUMENT ME!
public void dumpHTDoc(java.net.URL url)
url
- DOCUMENT ME!
public java.lang.String getExtension(java.net.URL url)
url
- DOCUMENT ME!
public java.lang.String getExtension(java.io.File file)
file
- DOCUMENT ME!
|
||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |