| 
||||||||
| PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||
| SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD | |||||||
java.lang.Object
  org.apache.lenya.search.crawler.IterativeHTMLCrawler
public class IterativeHTMLCrawler
Crawl iteratively
| Constructor Summary | |
|---|---|
IterativeHTMLCrawler(java.io.File config)
Creates a new IterativeHTMLCrawler object.  | 
|
IterativeHTMLCrawler(java.lang.String url_list_file,
                     java.lang.String html_dump_directory,
                     java.lang.String userAgent)
Creates a new IterativeHTMLCrawler object.  | 
|
| Method Summary | |
|---|---|
 java.net.URL | 
addURL(java.lang.String urlCandidate,
       java.lang.String currentURLPath)
Add URLs to crawl  | 
 java.net.URL | 
completeURL(java.net.URL parent,
            java.lang.String child)
DOCUMENT ME!  | 
 void | 
crawl(java.net.URL start,
      java.lang.String scope)
Crawl  | 
 void | 
dumpHTDoc(java.net.URL url)
DOCUMENT ME!  | 
 boolean | 
filterURL(java.lang.String url,
          java.lang.String currentURLPath,
          java.util.TreeSet links)
DOCUMENT ME!  | 
 java.lang.String | 
getExtension(java.io.File file)
DOCUMENT ME!  | 
 java.lang.String | 
getExtension(java.net.URL url)
DOCUMENT ME!  | 
static java.util.List | 
handleHTML(java.net.HttpURLConnection httpCon)
DOCUMENT ME!  | 
 void | 
handlePDF(java.net.HttpURLConnection httpCon)
Parse PDF for links  | 
 boolean | 
inScope(java.lang.String url)
DOCUMENT ME!  | 
static void | 
main(java.lang.String[] args)
Command line interface  | 
 java.lang.String | 
parseHREF(java.lang.String url,
          java.lang.String urlLowCase,
          java.lang.String currentURLPath)
Parse URL and complete if necessary  | 
 java.util.List | 
parsePage(java.lang.String urlString)
DOCUMENT ME!  | 
| Methods inherited from class java.lang.Object | 
|---|
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait | 
| Constructor Detail | 
|---|
public IterativeHTMLCrawler(java.lang.String url_list_file,
                            java.lang.String html_dump_directory,
                            java.lang.String userAgent)
url_list_file - File where all dumped files will be listed
html_dump_directory - Directory where htdocs should be dumped
userAgent - User-agent for robots.txt
public IterativeHTMLCrawler(java.io.File config)
config - Configuration File
| Method Detail | 
|---|
public static void main(java.lang.String[] args)
args - Configuration file crawler.xconf
public void crawl(java.net.URL start,
                  java.lang.String scope)
start - Start crawling at this URL
scope - Limit crawling to this scope
public java.net.URL addURL(java.lang.String urlCandidate,
                           java.lang.String currentURLPath)
                    throws java.net.MalformedURLException
urlCandidate - DOCUMENT ME!
currentURLPath - DOCUMENT ME!
java.net.MalformedURLException - DOCUMENT ME!
public java.util.List parsePage(java.lang.String urlString)
urlString - DOCUMENT ME!
public static java.util.List handleHTML(java.net.HttpURLConnection httpCon)
                                 throws java.io.IOException
httpCon - DOCUMENT ME!
java.io.IOException - DOCUMENT ME!
public void handlePDF(java.net.HttpURLConnection httpCon)
httpCon - DOCUMENT ME!
public boolean filterURL(java.lang.String url,
                         java.lang.String currentURLPath,
                         java.util.TreeSet links)
url - DOCUMENT ME!
currentURLPath - DOCUMENT ME!
links - DOCUMENT ME!
public java.lang.String parseHREF(java.lang.String url,
                                  java.lang.String urlLowCase,
                                  java.lang.String currentURLPath)
url - URL from href
urlLowCase - url in lower case
currentURLPath - URL of current page
public boolean inScope(java.lang.String url)
url - DOCUMENT ME!
public java.net.URL completeURL(java.net.URL parent,
                                java.lang.String child)
                         throws java.net.MalformedURLException
parent - DOCUMENT ME!
child - DOCUMENT ME!
java.net.MalformedURLException - DOCUMENT ME!
public void dumpHTDoc(java.net.URL url)
url - DOCUMENT ME!
public java.lang.String getExtension(java.net.URL url)
url - DOCUMENT ME!
public java.lang.String getExtension(java.io.File file)
file - DOCUMENT ME!
  | 
||||||||
| PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||
| SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD | |||||||