From: Erik Brakkee Date: Fri, 17 Mar 2006 23:22:35 +0000 (+0000) Subject: checkstyleZZ X-Git-Tag: wamblee-utils-0.7~1115 X-Git-Url: http://wamblee.org/gitweb/?a=commitdiff_plain;h=5abc820d10495d559cac9aede0a62521659bced4;p=utils checkstyleZZ --- diff --git a/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java b/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java index 144abe78..dd9e8ae7 100644 --- a/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java +++ b/crawler/basic/src/org/wamblee/crawler/AbstractPageRequest.java @@ -34,7 +34,6 @@ import org.apache.commons.httpclient.HttpClient; import org.apache.commons.httpclient.HttpMethod; import org.apache.commons.httpclient.HttpStatus; import org.apache.commons.httpclient.NameValuePair; -import org.apache.commons.httpclient.URIException; import org.apache.commons.httpclient.methods.GetMethod; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; diff --git a/crawler/basic/src/org/wamblee/crawler/impl/ActionImpl.java b/crawler/basic/src/org/wamblee/crawler/impl/ActionImpl.java index e5dac7d0..c3663735 100644 --- a/crawler/basic/src/org/wamblee/crawler/impl/ActionImpl.java +++ b/crawler/basic/src/org/wamblee/crawler/impl/ActionImpl.java @@ -12,7 +12,7 @@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - */ + */ package org.wamblee.crawler.impl; @@ -24,53 +24,93 @@ import org.wamblee.crawler.PageException; import org.wamblee.crawler.PageType; /** - * + * Action implementation. 
*/ public class ActionImpl implements Action { - - private Crawler _crawler; - private Element _content; - private String _name; - private String _reference; - private PageType _type; - - public ActionImpl(Crawler aCrawler, Element aContent, String aName, String aReference) { - _crawler = aCrawler; - _content = aContent; + + private Crawler _crawler; + + private Element _content; + + private String _name; + + private String _reference; + + private PageType _type; + + /** + * Constructs the action. + * + * @param aCrawler + * Crawler to use. + * @param aContent + * Content of the action element in the page where the action + * occurs. + * @param aName + * Name of the action. + * @param aReference + * URL of the reference. + */ + public ActionImpl(Crawler aCrawler, Element aContent, String aName, + String aReference) { + _crawler = aCrawler; + _content = aContent; _name = aName; _reference = aReference; - _type = null; + _type = null; } - - public ActionImpl(Crawler aCrawler, Element aContent, String aName, String aReference, PageType aType) { + + /** + * Constructs the action. + * + * @param aCrawler + * Crawler to use. + * @param aContent + * Content of the action element in the page where the action + * occurs. + * @param aName + * Name of the action. + * @param aReference + * URL of the reference. + * @param aType + * Type of the referenced page. 
+ */ + public ActionImpl(Crawler aCrawler, Element aContent, String aName, + String aReference, PageType aType) { _crawler = aCrawler; - _content = aContent; + _content = aContent; _name = aName; - _reference = aReference; - _type = aType; + _reference = aReference; + _type = aType; } - /* (non-Javadoc) + /* + * (non-Javadoc) + * * @see org.wamblee.crawler.Action#getName() */ public String getName() { return _name; } - - /* (non-Javadoc) + + /* + * (non-Javadoc) + * * @see org.wamblee.crawler.Action#execute() */ public Page execute() throws PageException { - if ( _type == null) { + if (_type == null) { return _crawler.getPage(_reference); } return _crawler.getPage(_reference, _type); } - - /* (non-Javadoc) + + /* + * (non-Javadoc) + * * @see org.wamblee.crawler.Action#getContent() */ public Element getContent() { - return _content; + return _content; } } diff --git a/crawler/basic/src/org/wamblee/crawler/impl/App.java b/crawler/basic/src/org/wamblee/crawler/impl/App.java index 65c7f802..6246453a 100644 --- a/crawler/basic/src/org/wamblee/crawler/impl/App.java +++ b/crawler/basic/src/org/wamblee/crawler/impl/App.java @@ -49,6 +49,12 @@ public final class App { private static final String LOG_FILE = "crawler.log"; + /** + * Runs a test program. + * @param aArgs Arguments. First argument is the crawler config file name and second argument is + * the start url. + * @throws Exception In case of problems. 
+ */ public static void main(String[] aArgs) throws Exception { String configFileName = aArgs[0]; String starturl = aArgs[1]; diff --git a/crawler/basic/src/org/wamblee/crawler/impl/ConfigItem.java b/crawler/basic/src/org/wamblee/crawler/impl/ConfigItem.java index d6c73859..6349d2c1 100644 --- a/crawler/basic/src/org/wamblee/crawler/impl/ConfigItem.java +++ b/crawler/basic/src/org/wamblee/crawler/impl/ConfigItem.java @@ -19,7 +19,7 @@ package org.wamblee.crawler.impl; import java.util.regex.Pattern; /** - * + * Configuration item for obtaining an object in case a pattern matches. */ class ConfigItem { @@ -27,11 +27,21 @@ class ConfigItem { private ValueType _value; + /** + * Constructs the item. + * @param aPattern Pattern. + * @param aValue Value. + */ protected ConfigItem(String aPattern, ValueType aValue) { _pattern = Pattern.compile(aPattern); _value = aValue; } + /** + * Returns the object in case the value matches. + * @param aValue Value to match. + * @return Object in case there is a match, null otherwise. + */ protected ValueType match(String aValue) { if (!_pattern.matcher(aValue).matches()) { return null; diff --git a/crawler/basic/src/org/wamblee/crawler/impl/ConfigurationImpl.java b/crawler/basic/src/org/wamblee/crawler/impl/ConfigurationImpl.java index 4a64c6e5..2b2680b5 100644 --- a/crawler/basic/src/org/wamblee/crawler/impl/ConfigurationImpl.java +++ b/crawler/basic/src/org/wamblee/crawler/impl/ConfigurationImpl.java @@ -31,6 +31,11 @@ public class ConfigurationImpl implements Configuration { private List _pageTypeConfig; + /** + * Constructs the configuration. + * @param aUrlConfig List of URL configuration elements. + * @param aPageTypeConfig List of page type configuration elements. 
+ */ public ConfigurationImpl(List aUrlConfig, List aPageTypeConfig) { _urlConfig = aUrlConfig; diff --git a/crawler/basic/src/org/wamblee/crawler/impl/ConfigurationParser.java b/crawler/basic/src/org/wamblee/crawler/impl/ConfigurationParser.java index 2f2e5f5b..e9dc4013 100644 --- a/crawler/basic/src/org/wamblee/crawler/impl/ConfigurationParser.java +++ b/crawler/basic/src/org/wamblee/crawler/impl/ConfigurationParser.java @@ -62,11 +62,21 @@ public class ConfigurationParser { private static final int MAX_DELAY = 5000; private PrintStream _os; - + + /** + * Constructs the configuration parser. + * @param aOs The stream for logging requests. + * TODO plain java logging should be used instead of this awkward mechanism. + */ public ConfigurationParser(PrintStream aOs) { _os = aOs; } + /** + * Parses the configuration from an input stream. + * @param aStream Input file. + * @return Configuration. + */ public Configuration parse(InputStream aStream) { try { SAXReader reader = new SAXReader(); @@ -82,8 +92,9 @@ public class ConfigurationParser { } /** - * @param aRoot - * @return + * Parses the URL-based configuration. + * @param aRoot Root of the configuration file document. + * @return List of URL-based configurations. */ private List parseUrlConfigs(Element aRoot) { List configs = new ArrayList(); @@ -95,6 +106,11 @@ public class ConfigurationParser { return configs; } + /** + * Parses the page type based configurations. + * @param aRoot Root of the configuration file document. + * @return List of page type based configurations. + */ private List parsePageTypeConfigs(Element aRoot) { List configs = new ArrayList(); for (Iterator i = aRoot.elementIterator(ELEM_TYPE); i.hasNext();) { @@ -105,12 +121,22 @@ public class ConfigurationParser { return configs; } + /** + * Parses a URL-based configuration. + * @param aUrlElem Configuration element. + * @return Configuration. 
+ */ private UrlConfig parseUrlConfig(Element aUrlElem) { String pattern = aUrlElem.elementText(ELEM_PATTERN); PageRequest request = parseRequestConfig(aUrlElem); return new UrlConfig(pattern, request); } + /** + * Parses a page type based configuration. + * @param aTypeElem Configuration element. + * @return Configuration. + */ private PageTypeConfig parsePageTypeConfig(Element aTypeElem) { String pattern = aTypeElem.elementText(ELEM_PATTERN); PageRequest request = parseRequestConfig(aTypeElem); @@ -118,14 +144,15 @@ public class ConfigurationParser { } /** - * @param aUrlElem - * @return + * Parses a request configuration describing how to execute requests. + * @param aElem Configuration element. + * @return Page request. */ - private PageRequest parseRequestConfig(Element aUrlElem) { - String method = aUrlElem.elementText(ELEM_METHOD); - String xslt = aUrlElem.elementText(ELEM_XSLT); + private PageRequest parseRequestConfig(Element aElem) { + String method = aElem.elementText(ELEM_METHOD); + String xslt = aElem.elementText(ELEM_XSLT); List params = new ArrayList(); - for (Iterator i = aUrlElem.elementIterator(ELEM_PARAM); i.hasNext();) { + for (Iterator i = aElem.elementIterator(ELEM_PARAM); i.hasNext();) { Element paramElem = (Element) i.next(); NameValuePair param = parseParameter(paramElem); params.add(param); @@ -147,6 +174,11 @@ public class ConfigurationParser { return request; } + /** + * Parses a parameter definition. + * @param aParam Parameter. + * @return Name value pair describing a parameter. 
+ */ private NameValuePair parseParameter(Element aParam) { String name = aParam.attributeValue(AT_NAME); String value = aParam.attributeValue(AT_VALUE); diff --git a/crawler/basic/src/org/wamblee/crawler/impl/CrawlerImpl.java b/crawler/basic/src/org/wamblee/crawler/impl/CrawlerImpl.java index 53a3873a..5633078c 100644 --- a/crawler/basic/src/org/wamblee/crawler/impl/CrawlerImpl.java +++ b/crawler/basic/src/org/wamblee/crawler/impl/CrawlerImpl.java @@ -12,7 +12,7 @@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - */ + */ package org.wamblee.crawler.impl; @@ -30,64 +30,80 @@ import org.wamblee.crawler.PageRequest; import org.wamblee.crawler.PageType; /** - * Crawler implementation. + * Crawler implementation. */ public class CrawlerImpl implements Crawler { - + private static final Log LOG = LogFactory.getLog(CrawlerImpl.class); - private static final int MAX_DELAY = 5000; - - private HttpClient _client; + + private static final int MAX_DELAY = 5000; + + private HttpClient _client; + private Configuration _config; - + + /** + * Constructs the crawler. + * + * @param aClient + * Http client to use. + * @param aConfig + * Configuration. 
+ */ public CrawlerImpl(HttpClient aClient, Configuration aConfig) { - _client = aClient; - _config = aConfig; + _client = aClient; + _config = aConfig; } /* - * (non-Javadoc) + * (non-Javadoc) + * * @see org.wamblee.crawler.Crawler#getPage(java.lang.String) */ public Page getPage(String aUrl) throws PageException { LOG.info("Getting page: url = '" + aUrl + "'"); PageRequest request = _config.getRequest(aUrl); Document content = request.execute(aUrl, _client); - return transformToDom4jDoc(content); + return transformToDom4jDoc(content); } - - /* (non-Javadoc) - * @see org.wamblee.crawler.Crawler#getPage(java.lang.String, java.lang.String) + + /* + * (non-Javadoc) + * + * @see org.wamblee.crawler.Crawler#getPage(java.lang.String, + * java.lang.String) */ public Page getPage(String aUrl, PageType aType) throws PageException { LOG.info("Getting page: url = '" + aUrl + "', type = '" + aType + "'"); PageRequest request = _config.getRequest(aType); Document content = request.execute(aUrl, _client); - return transformToDom4jDoc(content); + return transformToDom4jDoc(content); } - + /** - * @param aUrl - * @param request + * Converts a w3c DOM document to a page object. + * @param content DOM document. * @return */ private Page transformToDom4jDoc(Document content) { - DOMReader reader = new DOMReader(); org.dom4j.Document dom4jDoc = reader.read(content); Element root = dom4jDoc.getRootElement(); dom4jDoc.remove(root); - + return new PageImpl(this, replaceReferencesWithContent(root)); } - + /** - * Perform crawling. Find references in the retrieved content and replace them - * by the content they refer to by retrieving the appropriate pages as well. - * @param content Content which must be made complete. - * @return Fully processed content. + * Perform crawling. Find references in the retrieved content and replace + * them by the content they refer to by retrieving the appropriate pages as + * well. + * + * @param content + * Content which must be made complete. 
+ * @return Fully processed content. */ - private Element replaceReferencesWithContent(Element content) { - return content; // TODO implement. + private Element replaceReferencesWithContent(Element content) { + return content; // TODO implement. } } diff --git a/crawler/basic/src/org/wamblee/crawler/impl/PageImpl.java b/crawler/basic/src/org/wamblee/crawler/impl/PageImpl.java index e1e61d8d..d478ed4b 100644 --- a/crawler/basic/src/org/wamblee/crawler/impl/PageImpl.java +++ b/crawler/basic/src/org/wamblee/crawler/impl/PageImpl.java @@ -114,7 +114,7 @@ public class PageImpl implements Page { return null; } if (results.size() > 1) { - throw new RuntimeException("Duplicate link '" + aName + "'"); + throw new RuntimeException("Duplicate action '" + aName + "'"); } return results.get(0); } diff --git a/crawler/basic/src/org/wamblee/crawler/impl/PageTypeConfig.java b/crawler/basic/src/org/wamblee/crawler/impl/PageTypeConfig.java index 08ef1605..b7fa5a3c 100644 --- a/crawler/basic/src/org/wamblee/crawler/impl/PageTypeConfig.java +++ b/crawler/basic/src/org/wamblee/crawler/impl/PageTypeConfig.java @@ -19,14 +19,24 @@ package org.wamblee.crawler.impl; import org.wamblee.crawler.PageRequest; /** - * + * Page type configuration. */ public class PageTypeConfig extends ConfigItem { + /** + * Constructs the configuration. + * @param aPattern Page type pattern. + * @param aRequest Page request. + */ public PageTypeConfig(String aPattern, PageRequest aRequest) { super(aPattern, aRequest); } + /** + * Returns the request in case the type matches. + * @param aType Page type. + * @return Request if the type matches, null otherwise. + */ public PageRequest getRequest(String aType) { return match(aType); }