X-Git-Url: http://wamblee.org/gitweb/?a=blobdiff_plain;ds=sidebyside;f=crawler%2Fbasic%2Fsrc%2Forg%2Fwamblee%2Fcrawler%2Fimpl%2FConfigurationParser.java;h=7f1e9086106ece4d642959f8516dbdf077274fc9;hb=9dd28acaa4069b4ed857cdf968b8a2b623903b7b;hp=2f2e5f5bf95cdf9cc866c182551da757de820769;hpb=917321038aac9668051a64278525a2cc7bc5c2e2;p=utils diff --git a/crawler/basic/src/org/wamblee/crawler/impl/ConfigurationParser.java b/crawler/basic/src/org/wamblee/crawler/impl/ConfigurationParser.java index 2f2e5f5b..7f1e9086 100644 --- a/crawler/basic/src/org/wamblee/crawler/impl/ConfigurationParser.java +++ b/crawler/basic/src/org/wamblee/crawler/impl/ConfigurationParser.java @@ -17,7 +17,6 @@ package org.wamblee.crawler.impl; import java.io.InputStream; -import java.io.PrintStream; import java.util.ArrayList; import java.util.Iterator; import java.util.List; @@ -31,6 +30,7 @@ import org.wamblee.crawler.Configuration; import org.wamblee.crawler.GetPageRequest; import org.wamblee.crawler.PageRequest; import org.wamblee.crawler.PostPageRequest; +import org.wamblee.xml.XslTransformer; /** * Parsing of the configuration from an XML file. @@ -48,6 +48,8 @@ public class ConfigurationParser { private static final String ELEM_XSLT = "xslt"; private static final String ELEM_PARAM = "param"; + + private static final String ELEM_HEADER = "header"; private static final String AT_NAME = "name"; @@ -59,14 +61,22 @@ public class ConfigurationParser { private static final int MAX_TRIES = 3; - private static final int MAX_DELAY = 5000; + private static final int MAX_DELAY = 1000; + + private XslTransformer _transformer; - private PrintStream _os; - - public ConfigurationParser(PrintStream aOs) { - _os = aOs; + /** + * Constructs the configuration parser. + */ + public ConfigurationParser(XslTransformer aTransformer) { + _transformer = aTransformer; } + /** + * Parses the configuration from an input stream. + * @param aStream Input file. + * @return Configuration. + */ public Configuration parse(InputStream aStream) { try { SAXReader reader = new SAXReader(); @@ -82,8 +92,9 @@ public class ConfigurationParser { } /** - * @param aRoot - * @return + * Parses the URL-based configuration. + * @param aRoot Root of the configuration file document. + * @return List of URL-based configurations. */ private List parseUrlConfigs(Element aRoot) { List configs = new ArrayList(); @@ -95,6 +106,11 @@ public class ConfigurationParser { return configs; } + /** + * Parses the page type based configurations. + * @param aRoot Root of the configuration file document. + * @return LIst of page type based configurations. + */ private List parsePageTypeConfigs(Element aRoot) { List configs = new ArrayList(); for (Iterator i = aRoot.elementIterator(ELEM_TYPE); i.hasNext();) { @@ -105,12 +121,22 @@ public class ConfigurationParser { return configs; } + /** + * Parses a URL-based configuration. + * @param aUrlElem Configuration element. + * @return Configuration. + */ private UrlConfig parseUrlConfig(Element aUrlElem) { String pattern = aUrlElem.elementText(ELEM_PATTERN); PageRequest request = parseRequestConfig(aUrlElem); return new UrlConfig(pattern, request); } + /** + * Parses a page type based configuration. + * @param aTypeElem Configuration element. + * @return Configuration. + */ private PageTypeConfig parsePageTypeConfig(Element aTypeElem) { String pattern = aTypeElem.elementText(ELEM_PATTERN); PageRequest request = parseRequestConfig(aTypeElem); @@ -118,27 +144,25 @@ public class ConfigurationParser { } /** - * @param aUrlElem - * @return + * Parses a request configuration describing how to execute requests. + * @param aElem Configuration element. + * @return Page request. */ - private PageRequest parseRequestConfig(Element aUrlElem) { - String method = aUrlElem.elementText(ELEM_METHOD); - String xslt = aUrlElem.elementText(ELEM_XSLT); - List params = new ArrayList(); - for (Iterator i = aUrlElem.elementIterator(ELEM_PARAM); i.hasNext();) { - Element paramElem = (Element) i.next(); - NameValuePair param = parseParameter(paramElem); - params.add(param); - } - + private PageRequest parseRequestConfig(Element aElem) { + String method = aElem.elementText(ELEM_METHOD); + String xslt = aElem.elementText(ELEM_XSLT); + List params = parseNameValuePairs(aElem, ELEM_PARAM); + List headers = parseNameValuePairs(aElem, ELEM_HEADER); + NameValuePair[] paramsArray = params.toArray(new NameValuePair[0]); + NameValuePair[] headersArray = headers.toArray(new NameValuePair[0]); PageRequest request; if (METHOD_POST.equals(method)) { - request = new PostPageRequest(MAX_TRIES, MAX_DELAY, paramsArray, - xslt, _os); + request = new PostPageRequest(MAX_TRIES, MAX_DELAY, paramsArray, headersArray, + xslt, _transformer); } else if (METHOD_GET.equals(method) || method == null) { - request = new GetPageRequest(MAX_TRIES, MAX_DELAY, paramsArray, - xslt, _os); + request = new GetPageRequest(MAX_TRIES, MAX_DELAY, paramsArray, headersArray, + xslt, _transformer); } else { throw new RuntimeException("Unknown request method '" + method + "'. Only " + METHOD_GET + " and " + METHOD_POST @@ -147,6 +171,25 @@ public class ConfigurationParser { return request; } + /** + * @param aElem + * @return + */ + private List parseNameValuePairs(Element aElem, String aElemName) { + List headers = new ArrayList(); + for (Iterator i = aElem.elementIterator(aElemName); i.hasNext();) { + Element paramElem = (Element) i.next(); + NameValuePair header = parseParameter(paramElem); + headers.add(header); + } + return headers; + } + + /** + * Parses a parameter definition. + * @param aParam Parameter. + * @return Name value pair describing a parameter. + */ private NameValuePair parseParameter(Element aParam) { String name = aParam.attributeValue(AT_NAME); String value = aParam.attributeValue(AT_VALUE);