org.apache.lenya.search.crawler
Class IterativeHTMLCrawler

java.lang.Object
  extended by org.apache.lenya.search.crawler.IterativeHTMLCrawler

public class IterativeHTMLCrawler
extends java.lang.Object

Crawler that works iteratively, starting at a given URL and limiting the crawl to a given scope.


Constructor Summary
IterativeHTMLCrawler(java.io.File config)
          Creates a new IterativeHTMLCrawler object.
IterativeHTMLCrawler(java.lang.String url_list_file, java.lang.String html_dump_directory, java.lang.String userAgent)
          Creates a new IterativeHTMLCrawler object.
 
Method Summary
 java.net.URL addURL(java.lang.String urlCandidate, java.lang.String currentURLPath)
          Add URLs to crawl
 java.net.URL completeURL(java.net.URL parent, java.lang.String child)
          DOCUMENT ME!
 void crawl(java.net.URL start, java.lang.String scope)
          Crawls, starting at the given URL and limited to the given scope.
 void dumpHTDoc(java.net.URL url)
          DOCUMENT ME!
 boolean filterURL(java.lang.String url, java.lang.String currentURLPath, java.util.TreeSet links)
          DOCUMENT ME!
 java.lang.String getExtension(java.io.File file)
          DOCUMENT ME!
 java.lang.String getExtension(java.net.URL url)
          DOCUMENT ME!
static java.util.List handleHTML(java.net.HttpURLConnection httpCon)
          DOCUMENT ME!
 void handlePDF(java.net.HttpURLConnection httpCon)
          Parse PDF for links
 boolean inScope(java.lang.String url)
          DOCUMENT ME!
static void main(java.lang.String[] args)
          Command line interface
 java.lang.String parseHREF(java.lang.String url, java.lang.String urlLowCase, java.lang.String currentURLPath)
          Parse URL and complete if necessary
 java.util.List parsePage(java.lang.String urlString)
          DOCUMENT ME!
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Constructor Detail

IterativeHTMLCrawler

public IterativeHTMLCrawler(java.lang.String url_list_file,
                            java.lang.String html_dump_directory,
                            java.lang.String userAgent)
Creates a new IterativeHTMLCrawler object.

Parameters:
url_list_file - File where all dumped files will be listed
html_dump_directory - Directory where htdocs should be dumped
userAgent - User-agent for robots.txt

IterativeHTMLCrawler

public IterativeHTMLCrawler(java.io.File config)
Creates a new IterativeHTMLCrawler object.

Parameters:
config - Configuration File
Method Detail

main

public static void main(java.lang.String[] args)
Command line interface

Parameters:
args - Configuration file crawler.xconf

crawl

public void crawl(java.net.URL start,
                  java.lang.String scope)
Crawls, starting at the given URL and limited to the given scope.

Parameters:
start - Start crawling at this URL
scope - Limit crawling to this scope

addURL

public java.net.URL addURL(java.lang.String urlCandidate,
                           java.lang.String currentURLPath)
                    throws java.net.MalformedURLException
Add URLs to crawl

Parameters:
urlCandidate - DOCUMENT ME!
currentURLPath - DOCUMENT ME!
Returns:
DOCUMENT ME!
Throws:
java.net.MalformedURLException - DOCUMENT ME!

parsePage

public java.util.List parsePage(java.lang.String urlString)
DOCUMENT ME!

Parameters:
urlString - DOCUMENT ME!
Returns:
ok, 404

handleHTML

public static java.util.List handleHTML(java.net.HttpURLConnection httpCon)
                                 throws java.io.IOException
DOCUMENT ME!

Parameters:
httpCon - DOCUMENT ME!
Returns:
DOCUMENT ME!
Throws:
java.io.IOException - DOCUMENT ME!

handlePDF

public void handlePDF(java.net.HttpURLConnection httpCon)
Parse PDF for links

Parameters:
httpCon - DOCUMENT ME!

filterURL

public boolean filterURL(java.lang.String url,
                         java.lang.String currentURLPath,
                         java.util.TreeSet links)
DOCUMENT ME!

Parameters:
url - DOCUMENT ME!
currentURLPath - DOCUMENT ME!
links - DOCUMENT ME!
Returns:
DOCUMENT ME!

parseHREF

public java.lang.String parseHREF(java.lang.String url,
                                  java.lang.String urlLowCase,
                                  java.lang.String currentURLPath)
Parse URL and complete if necessary

Parameters:
url - URL from href
urlLowCase - Lower-case form of url
currentURLPath - URL of current page
Returns:
Completed URL

inScope

public boolean inScope(java.lang.String url)
Checks whether the given URL is within the crawl scope.

Parameters:
url - URL to check
Returns:
true if the URL is in scope, false otherwise

completeURL

public java.net.URL completeURL(java.net.URL parent,
                                java.lang.String child)
                         throws java.net.MalformedURLException
DOCUMENT ME!

Parameters:
parent - DOCUMENT ME!
child - DOCUMENT ME!
Returns:
DOCUMENT ME!
Throws:
java.net.MalformedURLException - DOCUMENT ME!

dumpHTDoc

public void dumpHTDoc(java.net.URL url)
DOCUMENT ME!

Parameters:
url - DOCUMENT ME!

getExtension

public java.lang.String getExtension(java.net.URL url)
Returns the extension of the given URL.

Parameters:
url - URL whose extension is wanted
Returns:
The extension of the URL

getExtension

public java.lang.String getExtension(java.io.File file)
Returns the extension of the given file.

Parameters:
file - File whose extension is wanted
Returns:
The extension of the file


Copyright © 1999-2005 Apache Software Foundation. All Rights Reserved.