package org.wamblee.crawler.impl;
import java.io.InputStream;
-import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.wamblee.crawler.GetPageRequest;
import org.wamblee.crawler.PageRequest;
import org.wamblee.crawler.PostPageRequest;
+import org.wamblee.xml.XslTransformer;
/**
* Parsing of the configuration from an XML file.
*/
public class ConfigurationParser {
-
- private static final String ELEM_URL = "url";
+
+ private static final String ELEM_URL = "url";
+
private static final String ELEM_TYPE = "type";
- private static final String ELEM_PATTERN = "pattern";
- private static final String ELEM_METHOD= "method";
- private static final String ELEM_XSLT = "xslt";
- private static final String ELEM_PARAM = "param";
+
+ private static final String ELEM_PATTERN = "pattern";
+
+ private static final String ELEM_METHOD = "method";
+
+ private static final String ELEM_XSLT = "xslt";
+
+ private static final String ELEM_PARAM = "param";
+
+ private static final String ELEM_HEADER = "header"; // child elements parsed into the headers array given to each request
+
private static final String AT_NAME = "name";
+
private static final String AT_VALUE = "value";
-
+
private static final String METHOD_POST = "post";
+
private static final String METHOD_GET = "get";
+
+ private static final int MAX_TRIES = 3; // first constructor argument of every Get/PostPageRequest built here; presumably the retry limit - TODO confirm
+
+ private static final int MAX_DELAY = 1000; // second constructor argument of every Get/PostPageRequest built here; NOTE(review): unit not visible here, presumably milliseconds - confirm
- private static final int MAX_TRIES = 3;
- private static final int MAX_DELAY = 5000;
-
- private PrintStream _os;
-
- public ConfigurationParser(PrintStream aOs) {
- _os = aOs;
+ private XslTransformer _transformer;
+
+ /**
+ * Constructs the configuration parser.
+ * @param aTransformer Transformer that is stored and handed to every page
+ * request created while parsing.
+ */
+ public ConfigurationParser(XslTransformer aTransformer) {
+ _transformer = aTransformer;
}
+ /**
+ * Parses the configuration from an input stream.
+ * The stream is read as an XML document; the URL-based and page type based
+ * configurations found under its root element make up the result.
+ * NOTE(review): the SAXReader is created with its default settings; confirm
+ * that DTD/external entity processing is acceptable if the input can come
+ * from an untrusted source.
+ * @param aStream Input stream providing the XML configuration document.
+ * @return Configuration.
+ * @throws RuntimeException If reading the document fails; the original
+ * DocumentException is kept as the cause.
+ */
public Configuration parse(InputStream aStream) {
try {
SAXReader reader = new SAXReader();
Document document = reader.read(aStream);
-
- Element root = document.getRootElement();
+
+ Element root = document.getRootElement();
List<UrlConfig> urlConfigs = parseUrlConfigs(root);
- List<PageTypeConfig> pageTypeConfigs = parsePageTypeConfigs(root);
+ List<PageTypeConfig> pageTypeConfigs = parsePageTypeConfigs(root);
return new ConfigurationImpl(urlConfigs, pageTypeConfigs);
} catch (DocumentException e) {
throw new RuntimeException("Problem parsing config file", e);
}
/**
- * @param root
- * @return
+ * Parses the URL-based configurations.
+ * Each "url" child element of the root yields one entry, added in
+ * iteration order.
+ * @param aRoot Root of the configuration file document.
+ * @return List of URL-based configurations.
*/
- private List<UrlConfig> parseUrlConfigs(Element root) {
+ private List<UrlConfig> parseUrlConfigs(Element aRoot) {
List<UrlConfig> configs = new ArrayList<UrlConfig>();
- for (Iterator i = root.elementIterator(ELEM_URL); i.hasNext(); ) {
- Element url = (Element)i.next();
+ for (Iterator i = aRoot.elementIterator(ELEM_URL); i.hasNext();) {
+ Element url = (Element) i.next();
UrlConfig config = parseUrlConfig(url);
configs.add(config);
}
return configs;
}
-
- private List<PageTypeConfig> parsePageTypeConfigs(Element root) {
+
+ /**
+ * Parses the page type based configurations.
+ * Each "type" child element of the root yields one entry, added in
+ * iteration order.
+ * @param aRoot Root of the configuration file document.
+ * @return List of page type based configurations.
+ */
+ private List<PageTypeConfig> parsePageTypeConfigs(Element aRoot) {
List<PageTypeConfig> configs = new ArrayList<PageTypeConfig>();
- for (Iterator i = root.elementIterator(ELEM_TYPE); i.hasNext(); ) {
- Element url = (Element)i.next();
+ for (Iterator i = aRoot.elementIterator(ELEM_TYPE); i.hasNext();) {
+ Element url = (Element) i.next();
PageTypeConfig config = parsePageTypeConfig(url);
configs.add(config);
}
return configs;
}
-
- private UrlConfig parseUrlConfig(Element aUrlElem) {
+
+ /**
+ * Parses a URL-based configuration.
+ * Combines the text of the element's "pattern" child with the request
+ * configuration parsed from the same element.
+ * @param aUrlElem Configuration element.
+ * @return Configuration.
+ */
+ private UrlConfig parseUrlConfig(Element aUrlElem) {
String pattern = aUrlElem.elementText(ELEM_PATTERN);
PageRequest request = parseRequestConfig(aUrlElem);
return new UrlConfig(pattern, request);
}
-
- private PageTypeConfig parsePageTypeConfig(Element aTypeElem) {
+
+ /**
+ * Parses a page type based configuration.
+ * Combines the text of the element's "pattern" child with the request
+ * configuration parsed from the same element.
+ * @param aTypeElem Configuration element.
+ * @return Configuration.
+ */
+ private PageTypeConfig parsePageTypeConfig(Element aTypeElem) {
String pattern = aTypeElem.elementText(ELEM_PATTERN);
PageRequest request = parseRequestConfig(aTypeElem);
return new PageTypeConfig(pattern, request);
}
/**
- * @param aUrlElem
- * @return
+ * Parses a request configuration describing how to execute requests.
+ * The text of the "method" child selects a POST or a GET request; a null
+ * method value is treated as GET. The "param" and "header" children become
+ * the request's parameter and header arrays, and the text of the "xslt"
+ * child is passed to the request unchanged.
+ * @param aElem Configuration element.
+ * @return Page request.
+ * @throws RuntimeException If the method is neither "get" nor "post".
*/
- private PageRequest parseRequestConfig(Element aUrlElem) {
- String method = aUrlElem.elementText(ELEM_METHOD);
- String xslt = aUrlElem.elementText(ELEM_XSLT);
- List<NameValuePair> params = new ArrayList<NameValuePair>();
- for (Iterator i = aUrlElem.elementIterator(ELEM_PARAM); i.hasNext(); ) {
- Element paramElem = (Element)i.next();
- NameValuePair param = parseParameter(paramElem);
- params.add(param);
- }
-
+ private PageRequest parseRequestConfig(Element aElem) {
+ String method = aElem.elementText(ELEM_METHOD);
+ String xslt = aElem.elementText(ELEM_XSLT);
+ List<NameValuePair> params = parseNameValuePairs(aElem, ELEM_PARAM);
+ List<NameValuePair> headers = parseNameValuePairs(aElem, ELEM_HEADER);
+
NameValuePair[] paramsArray = params.toArray(new NameValuePair[0]);
- PageRequest request;
- if ( METHOD_POST.equals(method)) {
- request = new PostPageRequest(MAX_TRIES, MAX_DELAY, paramsArray, xslt, _os);
- }
- else if ( METHOD_GET.equals(method) || method == null ){
- request = new GetPageRequest(MAX_TRIES, MAX_DELAY, paramsArray, xslt, _os);
- } else {
- throw new RuntimeException("Unknown request method '" + method + "'. Only " +
- METHOD_GET + " and " + METHOD_POST + " are supported");
+ NameValuePair[] headersArray = headers.toArray(new NameValuePair[0]);
+ PageRequest request;
+ if (METHOD_POST.equals(method)) {
+ request = new PostPageRequest(MAX_TRIES, MAX_DELAY, paramsArray, headersArray,
+ xslt, _transformer);
+ } else if (METHOD_GET.equals(method) || method == null) {
+ request = new GetPageRequest(MAX_TRIES, MAX_DELAY, paramsArray, headersArray,
+ xslt, _transformer);
+ } else {
+ throw new RuntimeException("Unknown request method '" + method
+ + "'. Only " + METHOD_GET + " and " + METHOD_POST
+ + " are supported");
}
return request;
}
-
- private NameValuePair parseParameter(Element aParam) {
+
+ /**
+ * Parses all child elements with a given name into name/value pairs.
+ * @param aElem Element whose child elements are examined.
+ * @param aElemName Name of the child elements to parse.
+ * @return List with one name/value pair per matching child element, in
+ * iteration order.
+ */
+ private List<NameValuePair> parseNameValuePairs(Element aElem, String aElemName) {
+ List<NameValuePair> headers = new ArrayList<NameValuePair>();
+ for (Iterator i = aElem.elementIterator(aElemName); i.hasNext();) {
+ Element paramElem = (Element) i.next();
+ NameValuePair header = parseParameter(paramElem);
+ headers.add(header);
+ }
+ return headers;
+ }
+
+ /**
+ * Parses a parameter definition.
+ * @param aParam Parameter.
+ * @return Name value pair describing a parameter.
+ */
+ private NameValuePair parseParameter(Element aParam) {
String name = aParam.attributeValue(AT_NAME);
String value = aParam.attributeValue(AT_VALUE);
return new NameValuePair(name, value);